From 3fe035c7e6dbe3f7d3610d5977a2a9037d52e365 Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Thu, 27 Feb 2025 11:58:56 -0800
Subject: [PATCH] [BACKEND] Dead code tmem_alloc that are not mutable (#6047)

This sets the right side effects on tmem_alloc so that dead code
elimination and CSE can kick in.
---
 .../TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td  |  2 +-
 lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp        | 19 +++++++++++++++++++
 .../Transforms/MMALowering.cpp                |  2 +-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
index 20ddc4ccc621..0b805e01d0ca 100644
--- a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
+++ b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
@@ -400,7 +400,7 @@ def TTNG_TMEMStoreOp : TTNG_Op<"tmem_store", [MemoryEffects<[MemWrite]>]> {
   let hasVerifier = 1;
 }
 
-def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [MemoryEffects<[MemWrite]>]> {
+def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
   let summary = "allocate tensor memory";
   let description = [{
     This operation allocates buffer in tensor memory and return a descriptor
diff --git a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
index 692c8c43a84f..57d26a90f417 100644
--- a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
+++ b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
@@ -388,6 +388,25 @@ LogicalResult TMEMAllocOp::verify() {
   return success();
 }
 
+// TMEMAllocOp
+void TMEMAllocOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  Operation *op = getOperation();
+  // If the allocation is immutable, mark it as having no side effects so that
+  // CSE and DCE can kick in during early compiler passes.
+  // Once the memory offset has been computed, we attach the real side effects
+  // to the op.
+  if (!getType().getMutableMemory() && !op->hasAttr("tensor_memory_col_offset"))
+    return;
+  effects.emplace_back(MemoryEffects::Allocate::get(),
+                       mlir::triton::nvidia_gpu::TensorMemory::get());
+  if (getSrc())
+    effects.emplace_back(MemoryEffects::Write::get(),
+                         getOperation()->getOpResult(0),
+                         mlir::triton::nvidia_gpu::TensorMemory::get());
+}
+
 bool isDescendingOrder(triton::gpu::MemDescType type) {
   auto order = triton::gpu::getOrder(type);
   auto rank = type.getRank();
diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp
index 04492fccfaa6..7a9642eae8f2 100644
--- a/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp
+++ b/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp
@@ -65,7 +65,7 @@ struct TCGen5MMAScaleSharedToTmemConversion
         cast(oldType.getEncoding());
     CTALayoutAttr CTALayout = getCTALayout(oldEncoding);
     ArrayRef<unsigned> CTASplitNum = CTALayout.getCTASplitNum();
-    ArrayRef<int64_t> shape = oldType.getAllocShape();
+    ArrayRef<int64_t> shape = oldType.getShape();
     Attribute scaleEncoding = TensorMemoryScalesEncodingAttr::get(
         context, CTASplitNum[0], CTASplitNum[1]);
     Type scaleAType =
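
Note on why declaring the interface is enough to unlock the cleanup passes:
MLIR's generic dead-code check only erases an op whose results are unused and
whose declared memory effects permit removal; the old MemoryEffects<[MemWrite]>
trait reported an unconditional write, which blocked that check. The sketch
below is illustrative only and is not part of the patch; the helper name
allocIsDead is made up, while isOpTriviallyDead is the upstream MLIR helper.

    #include "mlir/IR/Operation.h"
    #include "mlir/Interfaces/SideEffectInterfaces.h"

    // Illustrative sketch: generic DCE treats an op as removable when its
    // results have no uses and its declared effects allow it. After this
    // patch, an unused immutable tmem_alloc that does not yet carry the
    // "tensor_memory_col_offset" attribute reports no effects, so this
    // returns true; a mutable alloc still reports an Allocate effect and
    // is therefore kept alive.
    static bool allocIsDead(mlir::Operation *allocOp) {
      return mlir::isOpTriviallyDead(allocOp);
    }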