[BACKEND] Dead code tmem_alloc that are not mutable (#6047)

This sets the right side effects on tmem_alloc in order to have dead code eliminication and CSE kick off.
triton-lang · Feb 27, 2025 · 3fe035c · 3fe035c
1 parent 9f4c7fc
commit 3fe035c
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 2 deletions.
diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
@@ -400,7 +400,7 @@ def TTNG_TMEMStoreOp : TTNG_Op<"tmem_store", [MemoryEffects<[MemWrite]>]> {
   let hasVerifier = 1;
 }
 
-def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [MemoryEffects<[MemWrite]>]> {
+def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
   let summary = "allocate tensor memory";
   let description = [{
     This operation allocates buffer in tensor memory and return a descriptor

diff --git a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
@@ -388,6 +388,25 @@ LogicalResult TMEMAllocOp::verify() {
   return success();
 }
 
+// TMEMAllocOp
+void TMEMAllocOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  Operation *op = getOperation();
+  // If allocation is immutable, mark it as no side effect allow things like
+  // CSE, DCE to work in early compiler passes.
+  // After the memory offset is computed, we attach the true side effect to the
+  // op.
+  if (!getType().getMutableMemory() && !op->hasAttr("tensor_memory_col_offset"))
+    return;
+  effects.emplace_back(MemoryEffects::Allocate::get(),
+                       mlir::triton::nvidia_gpu::TensorMemory::get());
+  if (getSrc())
+    effects.emplace_back(MemoryEffects::Write::get(),
+                         getOperation()->getOpResult(0),
+                         mlir::triton::nvidia_gpu::TensorMemory::get());
+}
+
 bool isDescendingOrder(triton::gpu::MemDescType type) {
   auto order = triton::gpu::getOrder(type);
   auto rank = type.getRank();

diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp
@@ -65,7 +65,7 @@ struct TCGen5MMAScaleSharedToTmemConversion
         cast<SwizzledSharedEncodingAttr>(oldType.getEncoding());
     CTALayoutAttr CTALayout = getCTALayout(oldEncoding);
     ArrayRef<unsigned> CTASplitNum = CTALayout.getCTASplitNum();
-    ArrayRef<int64_t> shape = oldType.getAllocShape();
+    ArrayRef<int64_t> shape = oldType.getShape();
     Attribute scaleEncoding = TensorMemoryScalesEncodingAttr::get(
         context, CTASplitNum[0], CTASplitNum[1]);
     Type scaleAType =