[BACKEND] Don't allocate shmem for warps with repeated data in tt.scan (#5910)

It turns out that the previous changes within reduce to support LLs (linear layouts) had
already trimmed its shmem use to the right size.
lezcano authored Feb 13, 2025
1 parent 464d1f1 commit de650ad
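
For intuition, the sizing idea is that the scan's scratch buffer is derived from the number of warps described by the operand's layout (the product of its warpsPerCTA) rather than from the kernel's launch-time warp count, so warps that only hold repeated copies of the data do not inflate the allocation. Below is a minimal standalone C++ sketch of that idea; it is not Triton's actual helper API, the layout values are hypothetical, and the real formula (see the Utility.cpp hunk below) includes further factors not shown in this excerpt.

    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for product(getWarpsPerCTA()): multiply the
    // per-dimension warp counts reported by the layout.
    static unsigned product(const std::vector<unsigned> &dims) {
      unsigned p = 1;
      for (unsigned d : dims)
        p *= d;
      return p;
    }

    int main() {
      // Hypothetical scenario: the kernel is launched with 8 warps, but the
      // operand's layout distributes distinct data over only a 2x2 warp grid;
      // the remaining warps hold repeated copies.
      unsigned launchNumWarps = 8;                // launch-time warp count
      std::vector<unsigned> warpsPerCTA = {2, 2}; // what the layout describes
      unsigned nonAxisThreadsPerWarp = 8;         // hypothetical
      unsigned nonAxisElemsPerThread = 2;         // hypothetical

      unsigned perWarpElems = nonAxisThreadsPerWarp * nonAxisElemsPerThread;
      unsigned before = launchNumWarps * perWarpElems;      // old sizing
      unsigned after = product(warpsPerCTA) * perWarpElems; // new sizing
      std::printf("scratch elems: before=%u after=%u\n", before, after);
      return 0;
    }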
Showing 2 changed files with 11 additions and 1 deletion.
lib/Analysis/Utility.cpp (2 changes: 1 addition & 1 deletion)
@@ -267,7 +267,7 @@ bool ScanLoweringHelper::isSupported() {
 }
 
 unsigned ScanLoweringHelper::getScratchSizeInElems() {
-  unsigned numWarps = lookupNumWarps(scanOp);
+  unsigned numWarps = product(getEncoding().getWarpsPerCTA());
   unsigned numNonAxisElementsPerWarp =
       getNonAxisNumThreadsPerWarp() * getNonAxisNumElementsPerThread();
   unsigned numElements = numWarps * numNonAxisElementsPerWarp *
test/Analysis/test-allocation.mlir (10 changes: 10 additions & 0 deletions)
@@ -615,4 +615,14 @@ tt.func @call_graph_2(%A : !tt.ptr<f16>, %cond : i1) {
   // CHECK-NEXT: size = 1024
 }
 
+// CHECK-LABEL: scan_alloc
+tt.func @scan_alloc(%x : tensor<8x16xf32, #AL>) {
+  // CHECK: offset = 0, size = 128
+  %a = "tt.scan"(%x) <{axis = 0 : i32, reverse = false}>({
+  ^bb0(%arg0: f32, %arg1: f32):
+    %add = arith.addf %arg0, %arg1 : f32
+    tt.scan.return %add : f32
+  }) : (tensor<8x16xf32, #AL>) -> tensor<8x16xf32, #AL>
+  tt.return
+}
 }
