From 3fe035c7e6dbe3f7d3610d5977a2a9037d52e365 Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Thu, 27 Feb 2025 11:58:56 -0800
Subject: [PATCH] [BACKEND] Dead code tmem_alloc that are not mutable (#6047)

This sets the right side effects on tmem_alloc so that dead code
elimination and CSE can kick in.
---
 .../TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td  |  2 +-
 lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp        | 19 +++++++++++++++++++
 .../Transforms/MMALowering.cpp                |  2 +-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
index 20ddc4ccc621..0b805e01d0ca 100644
--- a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
+++ b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
@@ -400,7 +400,7 @@ def TTNG_TMEMStoreOp : TTNG_Op<"tmem_store", [MemoryEffects<[MemWrite]>]> {
   let hasVerifier = 1;
 }
 
-def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [MemoryEffects<[MemWrite]>]> {
+def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
   let summary = "allocate tensor memory";
   let description = [{
     This operation allocates buffer in tensor memory and return a descriptor
diff --git a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
index 692c8c43a84f..57d26a90f417 100644
--- a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
+++ b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
@@ -388,6 +388,25 @@ LogicalResult TMEMAllocOp::verify() {
   return success();
 }
 
+// TMEMAllocOp
+void TMEMAllocOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  Operation *op = getOperation();
+  // If the allocation is immutable, mark it as having no side effects so that
+  // CSE and DCE can kick in during early compiler passes.
+  // Once the memory offset has been computed, we attach the real side effects
+  // to the op.
+  if (!getType().getMutableMemory() && !op->hasAttr("tensor_memory_col_offset"))
+    return;
+  effects.emplace_back(MemoryEffects::Allocate::get(),
+                       mlir::triton::nvidia_gpu::TensorMemory::get());
+  if (getSrc())
+    effects.emplace_back(MemoryEffects::Write::get(),
+                         getOperation()->getOpResult(0),
+                         mlir::triton::nvidia_gpu::TensorMemory::get());
+}
+
 bool isDescendingOrder(triton::gpu::MemDescType type) {
   auto order = triton::gpu::getOrder(type);
   auto rank = type.getRank();
diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp
index 04492fccfaa6..7a9642eae8f2 100644
--- a/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp
+++ b/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp
@@ -65,7 +65,7 @@ struct TCGen5MMAScaleSharedToTmemConversion
         cast(oldType.getEncoding());
     CTALayoutAttr CTALayout = getCTALayout(oldEncoding);
     ArrayRef<unsigned> CTASplitNum = CTALayout.getCTASplitNum();
-    ArrayRef<int64_t> shape = oldType.getAllocShape();
+    ArrayRef<int64_t> shape = oldType.getShape();
     Attribute scaleEncoding = TensorMemoryScalesEncodingAttr::get(
         context, CTASplitNum[0], CTASplitNum[1]);
     Type scaleAType =
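
Note on why declaring the interface is enough to unlock the cleanup passes:
MLIR's generic dead-code check only erases an op whose results are unused and
whose declared memory effects permit removal; the old MemoryEffects<[MemWrite]>
trait reported an unconditional write, which blocked that check. The sketch
below is illustrative only and is not part of the patch; the helper name
allocIsDead is made up, while isOpTriviallyDead is the upstream MLIR helper.

    #include "mlir/IR/Operation.h"
    #include "mlir/Interfaces/SideEffectInterfaces.h"

    // Illustrative sketch: generic DCE treats an op as removable when its
    // results have no uses and its declared effects allow it. After this
    // patch, an unused immutable tmem_alloc that does not yet carry the
    // "tensor_memory_col_offset" attribute reports no effects, so this
    // returns true; a mutable alloc still reports an Allocate effect and
    // is therefore kept alive.
    static bool allocIsDead(mlir::Operation *allocOp) {
      return mlir::isOpTriviallyDead(allocOp);
    }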