From 8208c8c1cd22b8f49de5e9d5f46ff50a71da30f6 Mon Sep 17 00:00:00 2001
From: Abhishek Varma <abhvarma@amd.com>
Date: Mon, 12 Aug 2024 10:49:21 +0000
Subject: [PATCH] Address review comments, including offset/size/stride of
 linearized buffer

---
 ....cpp => AMDAIESplitLogicalObjectFifos.cpp} |  50 +++--
 .../iree-amd-aie/Transforms/CMakeLists.txt    |   2 +-
 .../iree-amd-aie/Transforms/PassDetail.h      |   2 +-
 .../iree-amd-aie/Transforms/Passes.cpp        |   2 +-
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.h  |   2 +-
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |   6 +-
 .../Transforms/test/CMakeLists.txt            |   2 +-
 ...rs.mlir => split_logical_objectfifos.mlir} | 186 +++++-------------
 8 files changed, 92 insertions(+), 160 deletions(-)
 rename compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/{AMDAIESplitBuffers.cpp => AMDAIESplitLogicalObjectFifos.cpp} (74%)
 rename compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/{split_buffers.mlir => split_logical_objectfifos.mlir} (64%)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp
similarity index 74%
rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp
rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp
index f6a7fdf97..8bde4cc5d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp
@@ -7,25 +7,20 @@
 #include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "iree-amd-aie/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/SCF/Transforms/Transforms.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/Iterators.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
 
-#define DEBUG_TYPE "iree-amdaie-split-buffers"
+#define DEBUG_TYPE "iree-amdaie-split-logical-objectfifos"
 
 namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
-class AMDAIESplitBuffersPass
-    : public impl::AMDAIESplitBuffersBase<AMDAIESplitBuffersPass> {
+class AMDAIESplitLogicalObjectFifosPass
+    : public impl::AMDAIESplitLogicalObjectFifosBase<
+          AMDAIESplitLogicalObjectFifosPass> {
  public:
-  using AMDAIESplitBuffersBase::AMDAIESplitBuffersBase;
+  using AMDAIESplitLogicalObjectFifosBase::AMDAIESplitLogicalObjectFifosBase;
 
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<AMDAIEDialect>();
@@ -33,9 +28,10 @@ class AMDAIESplitBuffersPass
   void runOnOperation() override;
 };
 
-void AMDAIESplitBuffersPass::runOnOperation() {
+void AMDAIESplitLogicalObjectFifosPass::runOnOperation() {
   ModuleOp moduleOp = getOperation();
-  IRRewriter rewriter(moduleOp.getContext());
+  MLIRContext *context = &getContext();
+  IRRewriter rewriter(context);
 
   SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
   // We are currently walking through CoreOps gathering 3rd Input DmaOp (if
@@ -60,7 +56,7 @@ void AMDAIESplitBuffersPass::runOnOperation() {
         l2ToL1DmaOp.getTargetObjectFifo();
     Value targetAllocOp = targetObjectFifo.getMemref();
 
-    // Now we'll create a narrowed L2 buffer.
+    // Now we'll create a narrowed linearized L2 buffer.
     rewriter.setInsertionPoint(sourceAllocOp);
     auto oldSourceMemRefType = cast<MemRefType>(sourceAllocOp.getType());
     auto targetMemRefType = cast<MemRefType>(targetAllocOp.getType());
@@ -93,11 +89,25 @@ void AMDAIESplitBuffersPass::runOnOperation() {
         rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type),
         newAllocOp.getResult(), sourceObjectFifo.getTiles());
 
-    // Create new L3 -> L2 Dma Op.
+    // Create new L3 -> L2 Dma Op. Since the narrowed L2 buffer is linearized,
+    // we need to form offset/size/stride corresponding to the linearized
+    // buffer.
+    SmallVector<OpFoldResult, 4> staticOffsets(
+        4, getAsIndexOpFoldResult(context, 0));
+    SmallVector<OpFoldResult, 4> staticSizes(
+        4, getAsIndexOpFoldResult(context, 1));
+    SmallVector<OpFoldResult, 4> staticStrides(
+        4, getAsIndexOpFoldResult(context, 0));
+    OpFoldResult linearizedShape =
+        getAsIndexOpFoldResult(context, newAllocType.getNumElements());
+    staticSizes[staticSizes.size() - 1] = linearizedShape;
+    staticStrides[staticStrides.size() - 1] =
+        getAsIndexOpFoldResult(context, 1);
+    staticStrides[staticStrides.size() - 2] = linearizedShape;
     rewriter.setInsertionPoint(l3ToL2DmaOp);
     rewriter.create<AMDAIE::DmaCpyNdOp>(
-        l3ToL2DmaOp.getLoc(), source, l3ToL2DmaOp.getTargetMixedOffsets(),
-        l3ToL2DmaOp.getTargetMixedSizes(), l3ToL2DmaOp.getTargetMixedStrides(),
+        l3ToL2DmaOp.getLoc(), source, llvm::ArrayRef(staticOffsets),
+        llvm::ArrayRef(staticSizes), llvm::ArrayRef(staticStrides),
         l3ToL2DmaOp.getSource(), l3ToL2DmaOp.getSourceMixedOffsets(),
         l3ToL2DmaOp.getSourceMixedSizes(), l3ToL2DmaOp.getSourceMixedStrides());
 
@@ -107,8 +117,8 @@ void AMDAIESplitBuffersPass::runOnOperation() {
         l2ToL1DmaOp.getLoc(), l2ToL1DmaOp.getTarget(),
         l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(),
         l2ToL1DmaOp.getTargetMixedStrides(), source,
-        l2ToL1DmaOp.getSourceMixedOffsets(), l2ToL1DmaOp.getSourceMixedSizes(),
-        l2ToL1DmaOp.getSourceMixedStrides());
+        llvm::ArrayRef(staticOffsets), llvm::ArrayRef(staticSizes),
+        llvm::ArrayRef(staticStrides));
     rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp);
     // We have to discard non-zero offsets as subview has been replaced by a
     // dedicated allocated memref.
@@ -138,8 +148,8 @@ void AMDAIESplitBuffersPass::runOnOperation() {
 
 }  // namespace
 
-std::unique_ptr<Pass> createAMDAIESplitBuffersPass() {
-  return std::make_unique<AMDAIESplitBuffersPass>();
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjectFifosPass() {
+  return std::make_unique<AMDAIESplitLogicalObjectFifosPass>();
 }
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index c6edd665b..1f8314142 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -82,7 +82,7 @@ iree_cc_library(
     "AMDAIEPad.cpp"
     "AMDAIEPeelForLoop.cpp"
     "AMDAIEPropagateDataLayout.cpp"
-    "AMDAIESplitBuffers.cpp"
+    "AMDAIESplitLogicalObjectFifos.cpp"
     "AMDAIETile.cpp"
     "AMDAIETileAndFuse.cpp"
     "AMDAIEUtils.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 5be6ddf82..5e2e8950d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -62,7 +62,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEPAD
 #define GEN_PASS_DEF_AMDAIEPEELFORLOOP
 #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT
-#define GEN_PASS_DEF_AMDAIESPLITBUFFERS
+#define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJECTFIFOS
 #define GEN_PASS_DEF_AMDAIETILE
 #define GEN_PASS_DEF_AMDAIETILEANDFUSE
 #define GEN_PASS_DEF_AMDAIEVECTORIZATION
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 38615a80a..505d04e81 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -578,7 +578,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
   passManager.addPass(createAMDAIEDistributeCoresAndObjectFifosPass());
   passManager.addPass(createCSEPass());
   passManager.addPass(createCanonicalizerPass());
-  passManager.addPass(createAMDAIESplitBuffersPass());
+  passManager.addPass(createAMDAIESplitLogicalObjectFifosPass());
 
   passManager.addPass(createAMDAIEDmaToCircularDmaPass());
   passManager.addNestedPass<func::FuncOp>(createAMDAIECreateAIEWorkgroupPass());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index b2b419f05..2530bc814 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -198,7 +198,7 @@ std::unique_ptr<Pass> createAMDAIEPeelForLoopPass(
     AMDAIEPeelForLoopOptions options = {});
 
 /// Create a pass to split buffers.
-std::unique_ptr<Pass> createAMDAIESplitBuffersPass();
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjectFifosPass();
 
 /// Create pass to tile TilingInterface operations.
 std::unique_ptr<Pass> createAMDAIETilePass(AMDAIETileOptions options = {});
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 2fc72fb2e..60dad7f9c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -402,10 +402,10 @@ def AMDAIEPropagateDataLayout :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEPropagateDataLayoutPass()";
 }
 
-def AMDAIESplitBuffers :
-  Pass<"iree-amdaie-split-buffers", "ModuleOp"> {
+def AMDAIESplitLogicalObjectFifos :
+  Pass<"iree-amdaie-split-logical-objectfifos", "ModuleOp"> {
   let summary = "Pass to split L2 buffers to share inputs of Matmul and Elementwise operations.";
-  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitBuffersPass()";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjectFifosPass()";
 }
 
 def AMDAIETile :
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index 4909174e6..954d85509 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -54,7 +54,7 @@ iree_lit_test_suite(
     "pad.mlir"
     "peel_for_loop.mlir"
     "propagate_data_layout.mlir"
-    "split_buffers.mlir"
+    "split_logical_objectfifos.mlir"
     "tile_and_fuse_using_scf_for.mlir"
     "tile_and_fuse_using_scf_forall.mlir"
     "tile_copy_using_scf_for.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir
similarity index 64%
rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir
rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir
index 9be8c2177..fbd4b697b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-buffers,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s
 
 // Glossary:
 // candidate core op : they are those amdaie.core ops which have at least three input dma ops.
@@ -16,59 +16,30 @@
 module {
   func.func @split_l2_buffer_no_candidate_core_op(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
     %c3 = arith.constant 3 : index
-    %c16 = arith.constant 16 : index
-    %c8 = arith.constant 8 : index
-    %c4 = arith.constant 4 : index
-    %c128 = arith.constant 128 : index
-    %c2048 = arith.constant 2048 : index
-    %c256 = arith.constant 256 : index
-    %c1024 = arith.constant 1024 : index
-    %c4096 = arith.constant 4096 : index
-    %c32 = arith.constant 32 : index
     %c2 = arith.constant 2 : index
     %c1 = arith.constant 1 : index
     %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
     %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
-    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
     %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
     %alloc_2 = memref.alloc() : memref<128x128xi32>
-    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
     %tile = amdaie.tile(%c1, %c3)
     %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
     scf.forall (%arg4, %arg5) in (2, 2) {
       %2 = affine.apply #map(%arg5)
       %3 = affine.apply #map(%arg4)
-      %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %tile_4 = amdaie.tile(%c1, %c3)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
-      %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
-      %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
-      %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
-      %11 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %12 = amdaie.core(%tile_4, in : [%7, %8], out : [%11]) {
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %12 = amdaie.core(%tile_4, in : [%7], out : []) {
         %13 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
-        %14 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
-        %15 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%13, %14 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%15 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
-        ^bb0(%in: i32, %in_5: i32, %out: i32):
-          %18 = arith.muli %in, %in_5 : i32
-          %19 = arith.addi %out, %18 : i32
-          linalg.yield %19 : i32
-        }
-        %16 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        %17 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%15, %16 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%17 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
-        ^bb0(%in: i32, %in_5: i32, %out: i32):
-          %18 = arith.addi %in, %in_5 : i32
-          linalg.yield %18 : i32
-        }
+        linalg.fill ins(%c0_i32 : i32) outs(%13 : memref<1x1x4x8x4x8xi32, 2 : i32>)
         amdaie.end
       }
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
-    memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32>
-    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
     memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32>
     memref.dealloc %alloc_2 : memref<128x128xi32>
     return
@@ -91,15 +62,16 @@ module {
 //       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE]]} :
 //  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
 //       CHECK:   scf.forall
-//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO]]
-//  CHECK-SAME:                                                       %[[L3_OBJECTFIFO]]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]]
 //       CHECK:       amdaie.logicalobjectfifo.from_memref
 //       CHECK:       amdaie.logicalobjectfifo.from_memref
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       %[[L1_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]]
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO]]
-//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO]]
+//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
 //       CHECK:       amdaie.core(%[[TILE]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1]]], out :
 //       CHECK:         linalg.generic
 //       CHECK:       }
@@ -112,15 +84,6 @@ module {
 module {
   func.func @split_l2_buffer_one_candidate_core_op(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
     %c3 = arith.constant 3 : index
-    %c16 = arith.constant 16 : index
-    %c8 = arith.constant 8 : index
-    %c4 = arith.constant 4 : index
-    %c128 = arith.constant 128 : index
-    %c2048 = arith.constant 2048 : index
-    %c256 = arith.constant 256 : index
-    %c1024 = arith.constant 1024 : index
-    %c4096 = arith.constant 4096 : index
-    %c32 = arith.constant 32 : index
     %c2 = arith.constant 2 : index
     %c1 = arith.constant 1 : index
     %c0 = arith.constant 0 : index
@@ -135,15 +98,15 @@ module {
     scf.forall (%arg4, %arg5) in (2, 2) {
       %2 = affine.apply #map(%arg5)
       %3 = affine.apply #map(%arg4)
-      %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %tile_4 = amdaie.tile(%c1, %c3)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
       %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
-      %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
-      %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
       %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-      %10 = amdaie.dma_cpy_nd(%9[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
-      %11 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
       %12 = amdaie.core(%tile_4, in : [%7, %8, %10], out : [%11]) {
         %13 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
         %14 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
@@ -195,23 +158,25 @@ module {
 //       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
 //  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
 //       CHECK:   scf.forall
-//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO_0]]
-//  CHECK-SAME:                                                       %[[L3_OBJECTFIFO]]
-//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO_1]]
-//  CHECK-SAME:                                                       %[[L3_OBJECTFIFO]]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]]
 //       CHECK:       amdaie.logicalobjectfifo.from_memref
 //       CHECK:       amdaie.logicalobjectfifo.from_memref
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_0]]
-//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO_0]]
+//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
 //       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_1]]
-//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO_1]]
+//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
 //       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
 //       CHECK:         linalg.generic
 //       CHECK:       }
@@ -225,15 +190,6 @@ module {
 module {
   func.func @split_l2_buffer_two_core_ops(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
     %c3 = arith.constant 3 : index
-    %c16 = arith.constant 16 : index
-    %c8 = arith.constant 8 : index
-    %c4 = arith.constant 4 : index
-    %c128 = arith.constant 128 : index
-    %c2048 = arith.constant 2048 : index
-    %c256 = arith.constant 256 : index
-    %c1024 = arith.constant 1024 : index
-    %c4096 = arith.constant 4096 : index
-    %c32 = arith.constant 32 : index
     %c2 = arith.constant 2 : index
     %c1 = arith.constant 1 : index
     %c0 = arith.constant 0 : index
@@ -249,15 +205,15 @@ module {
     scf.forall (%arg4, %arg5) in (2, 2) {
       %2 = affine.apply #map(%arg5)
       %3 = affine.apply #map(%arg4)
-      %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %tile_5 = amdaie.tile(%c1, %c3)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
       %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
-      %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
-      %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
       %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-      %10 = amdaie.dma_cpy_nd(%9[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
-      %11 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
       %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) {
         %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
         %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
@@ -278,7 +234,7 @@ module {
         amdaie.end
       }
       %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-      %14 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
       %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) {
         %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
         %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
@@ -334,32 +290,34 @@ module {
 //       CHECK:   %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
 //  CHECK-SAME:         memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
 //       CHECK:   scf.forall
-//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO_0]]
-//  CHECK-SAME:                                                       %[[L3_OBJECTFIFO]]
-//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO_1]]
-//  CHECK-SAME:                                                       %[[L3_OBJECTFIFO]]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]]
+//       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+//  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
+//  CHECK-SAME:                                         %[[L3_OBJECTFIFO]]
 //       CHECK:       amdaie.logicalobjectfifo.from_memref
 //       CHECK:       amdaie.logicalobjectfifo.from_memref
 //       CHECK:       amdaie.dma_cpy_nd
 //       CHECK:       amdaie.dma_cpy_nd
-//       CHECK:       amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}], out :
-//       CHECK:         linalg.generic
+//       CHECK:       amdaie.core(%[[TILE_0]]
+//       CHECK:         linalg.fill
 //       CHECK:         amdaie.end
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_0]]
-//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO_0]]
+//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
 //       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
 //       CHECK:         amdaie.end
 //       CHECK:       }
-//       CHECK:       amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}], out :
-//       CHECK:         linalg.generic
+//       CHECK:       amdaie.core(%[[TILE_2]]
+//       CHECK:         linalg.fill
 //       CHECK:         amdaie.end
 //       CHECK:       }
 //       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_1]]
-//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO_1]]
+//  CHECK-SAME:                                                       %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1]
 //       CHECK:       amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
 //       CHECK:         linalg.generic
 //       CHECK:         amdaie.end
@@ -376,18 +334,10 @@ module {
 module {
   func.func @split_l2_buffer_mixed_core_ops(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
     %c3 = arith.constant 3 : index
-    %c16 = arith.constant 16 : index
-    %c8 = arith.constant 8 : index
-    %c4 = arith.constant 4 : index
-    %c128 = arith.constant 128 : index
-    %c2048 = arith.constant 2048 : index
-    %c256 = arith.constant 256 : index
-    %c1024 = arith.constant 1024 : index
-    %c4096 = arith.constant 4096 : index
-    %c32 = arith.constant 32 : index
     %c2 = arith.constant 2 : index
     %c1 = arith.constant 1 : index
     %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
     %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
     %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
     %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
@@ -402,34 +352,20 @@ module {
     scf.forall (%arg4, %arg5) in (2, 2) {
       %2 = affine.apply #map(%arg5)
       %3 = affine.apply #map(%arg4)
-      %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %tile_7 = amdaie.tile(%c1, %c3)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
       %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
-      %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
-      %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
-      %9 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %10 = amdaie.core(%tile_7, in : [%7, %8], out : [%9]) {
-        %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
-        %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
-        %20 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%18, %19 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
-        ^bb0(%in: i32, %in_8: i32, %out: i32):
-          %23 = arith.muli %in, %in_8 : i32
-          %24 = arith.addi %out, %23 : i32
-          linalg.yield %24 : i32
-        }
-        %21 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        %22 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%20, %21 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%22 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
-        ^bb0(%in: i32, %in_8: i32, %out: i32):
-          %23 = arith.addi %in, %in_8 : i32
-          linalg.yield %23 : i32
-        }
+      %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %9 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %10 = amdaie.core(%tile_7, in : [%7], out : []) {
+        %11 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%11 : memref<1x1x4x8x4x8xi32, 2 : i32>)
         amdaie.end
       }
       %11 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-      %12 = amdaie.dma_cpy_nd(%11[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %12 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
       %13 = amdaie.core(%tile_4, in : [%7, %8, %12], out : [%9]) {
         %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
         %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
@@ -449,27 +385,13 @@ module {
         }
         amdaie.end
       }
-      %14 = amdaie.core(%tile_5, in : [%7, %8], out : [%9]) {
-        %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
-        %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
-        %20 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%18, %19 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
-        ^bb0(%in: i32, %in_8: i32, %out: i32):
-          %23 = arith.muli %in, %in_8 : i32
-          %24 = arith.addi %out, %23 : i32
-          linalg.yield %24 : i32
-        }
-        %21 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        %22 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
-        linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%20, %21 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%22 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
-        ^bb0(%in: i32, %in_8: i32, %out: i32):
-          %23 = arith.addi %in, %in_8 : i32
-          linalg.yield %23 : i32
-        }
+      %14 = amdaie.core(%tile_5, in : [%7], out : []) {
+        %15 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        linalg.fill ins(%c0_i32 : i32) outs(%15 : memref<1x1x4x8x4x8xi32, 2 : i32>)
         amdaie.end
       }
       %15 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-      %16 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %16 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
       %17 = amdaie.core(%tile_6, in : [%7, %8, %16], out : [%9]) {
         %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
         %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>