[CK_TILE] Shrink lse_accum distributed tensor size in fmha splitkv combine kernel #1577

Draft · wants to merge 14 commits into base: develop · Changes from 4 commits
```diff
@@ -138,9 +138,8 @@ struct BlockFmhaFwdSplitKVCombinePipeline
         auto lse_accum = make_static_distributed_tensor<LSEDataType>(
             Policy::template MakeLSEaccRegTileDistribution<Problem>());

-        // copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, max(kMaxSplits, warp_size)])
-        // this will extend the distributed tensor width so that each thread in wave have data to
-        // reduce.
+        // copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, kMaxSplits])
+        // and fill up -INF values outside the [kM0, num_splits] region.
         {
             constexpr auto spans = decltype(lse_accum)::get_distributed_spans();
             sweep_tile_span(spans[number<0>{}], [&](auto idx0) {
```
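To make the new comment concrete, here is a minimal standalone sketch in plain C++ of the copy-and-fill semantics; the function name, flat-array layout, and float element type are illustrative assumptions, since the real kernel does this through sweep_tile_span on the distributed tensor:

```cpp
#include <limits>
#include <vector>

// Hypothetical flat-array model of the copy step above: lse_accum keeps
// exactly kMaxSplits columns, and columns at or beyond num_splits are set
// to -INF so the subsequent max / log-sum-exp reduction ignores them.
void copy_lds_to_lse_accum(const std::vector<float>& lds, // [kM0 * kMaxSplits]
                           std::vector<float>& lse_accum, // [kM0 * kMaxSplits]
                           int kM0, int kMaxSplits, int num_splits)
{
    for(int m = 0; m < kM0; ++m)
        for(int n = 0; n < kMaxSplits; ++n)
            lse_accum[m * kMaxSplits + n] =
                (n < num_splits) ? lds[m * kMaxSplits + n]
                                 : -std::numeric_limits<float>::infinity();
}
```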
```diff
@@ -134,26 +134,30 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
     {
         constexpr index_t kBlockSize = Problem::kBlockSize;

-        constexpr index_t kNPerBlock = max(Problem::kMaxSplits, get_warp_size());
+        constexpr index_t kNPerBlock = Problem::kMaxSplits;
         constexpr index_t kMPerBlock = Problem::kM0;

-        constexpr index_t NThreads = get_warp_size();
+        constexpr index_t NThreads = 4;
         constexpr index_t NPerThread = kNPerBlock / NThreads;

-        constexpr index_t MThreads = kBlockSize / NThreads;
-        constexpr index_t MPerThread = kMPerBlock / MThreads;
+        constexpr index_t MThreadPerWarp = get_warp_size() / NThreads;
+        constexpr index_t MWarps = kMPerBlock / MThreadPerWarp;
```
Contributor:

> `MWarps = kMPerBlock / MThreadPerWarp`? Confusing, why not `MWarps = kBlockSize / get_warp_size()`?

Contributor (Author):

> Because I'm using the hard-coded kBlockSize = 256, so kBlockSize / get_warp_size() (= 4) would exceed the real number of warps needed here if kMPerBlock is less than 64.
```diff
+        constexpr index_t MThreads = MThreadPerWarp * MWarps;
+        constexpr index_t MPerThread = kMPerBlock / MThreads;

+        static_assert(kBlockSize % MWarps == 0);
         static_assert(NThreads * NPerThread == kNPerBlock);
         static_assert(MThreads * MPerThread == kMPerBlock);

+        // duplicate MWarps if less than (kBlockSize / get_warp_size())
         return make_static_tile_distribution(
             tile_distribution_encoding<
                 sequence<1>,
-                tuple<sequence<MThreads, MPerThread>, sequence<NThreads, NPerThread>>,
-                tuple<sequence<1>, sequence<2>>,
-                tuple<sequence<0>, sequence<0>>,
+                tuple<sequence<MWarps, MThreadPerWarp, MPerThread>, sequence<NThreads, NPerThread>>,
+                tuple<sequence<0, 1>, sequence<2, 1>>,
+                tuple<sequence<0, 0>, sequence<0, 1>>,
                 sequence<1, 2>,
-                sequence<1, 1>>{});
+                sequence<2, 1>>{});
     }

     template <typename Problem>
```
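For reference, a minimal sketch of how the new constants compose, assuming kMaxSplits = 16, kM0 = 32, kBlockSize = 256, and a warp size of 64 (all example values, not mandated by the PR):

```cpp
// Derivation of the new tile-distribution parameters under the assumed sizes.
constexpr int kNPerBlock     = 16;                          // Problem::kMaxSplits
constexpr int kMPerBlock     = 32;                          // Problem::kM0
constexpr int NThreads       = 4;                           // lanes along the split axis
constexpr int NPerThread     = kNPerBlock / NThreads;       // 4 splits per lane
constexpr int MThreadPerWarp = 64 / NThreads;               // 16 rows per warp
constexpr int MWarps         = kMPerBlock / MThreadPerWarp; // 2 warps with distinct rows
constexpr int MThreads       = MThreadPerWarp * MWarps;     // 32
constexpr int MPerThread     = kMPerBlock / MThreads;       // 1 row per lane

static_assert(NThreads * NPerThread == kNPerBlock);
static_assert(MThreads * MPerThread == kMPerBlock);
// Under these sizes, kBlockSize / 64 - MWarps = 2 warps remain; they
// presumably carry copies of the same rows, which is what the
// "duplicate MWarps" note in the diff appears to refer to.
```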