diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp similarity index 74% rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp index f6a7fdf97..8bde4cc5d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp @@ -7,25 +7,20 @@ #include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/Transforms/Transforms.h" -#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/SCF/Transforms/Transforms.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/Iterators.h" #include "mlir/Pass/Pass.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h" -#define DEBUG_TYPE "iree-amdaie-split-buffers" +#define DEBUG_TYPE "iree-amdaie-split-logical-objectfifos" namespace mlir::iree_compiler::AMDAIE { namespace { -class AMDAIESplitBuffersPass - : public impl::AMDAIESplitBuffersBase { +class AMDAIESplitLogicalObjectFifosPass + : public impl::AMDAIESplitLogicalObjectFifosBase< + AMDAIESplitLogicalObjectFifosPass> { public: - using AMDAIESplitBuffersBase::AMDAIESplitBuffersBase; + using AMDAIESplitLogicalObjectFifosBase::AMDAIESplitLogicalObjectFifosBase; void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); @@ -33,9 +28,10 @@ class AMDAIESplitBuffersPass void runOnOperation() override; }; -void AMDAIESplitBuffersPass::runOnOperation() { +void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { ModuleOp moduleOp = getOperation(); - IRRewriter rewriter(moduleOp.getContext()); + MLIRContext *context = &getContext(); + IRRewriter rewriter(context); SmallVector l2ToL1DmaOps; // We are currently walking through CoreOps gathering 3rd Input DmaOp (if @@ -60,7 +56,7 @@ void AMDAIESplitBuffersPass::runOnOperation() { l2ToL1DmaOp.getTargetObjectFifo(); Value targetAllocOp = targetObjectFifo.getMemref(); - // Now we'll create a narrowed L2 buffer. + // Now we'll create a narrowed linearized L2 buffer. rewriter.setInsertionPoint(sourceAllocOp); auto oldSourceMemRefType = cast(sourceAllocOp.getType()); auto targetMemRefType = cast(targetAllocOp.getType()); @@ -93,11 +89,25 @@ void AMDAIESplitBuffersPass::runOnOperation() { rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type), newAllocOp.getResult(), sourceObjectFifo.getTiles()); - // Create new L3 -> L2 Dma Op. + // Create new L3 -> L2 Dma Op. Since the narrowed L2 buffer is linearized, + // we need to form offset/size/stride corresponding to the linearized + // buffer. + SmallVector staticOffsets( + 4, getAsIndexOpFoldResult(context, 0)); + SmallVector staticSizes( + 4, getAsIndexOpFoldResult(context, 1)); + SmallVector staticStrides( + 4, getAsIndexOpFoldResult(context, 0)); + OpFoldResult linearizedShape = + getAsIndexOpFoldResult(context, newAllocType.getNumElements()); + staticSizes[staticSizes.size() - 1] = linearizedShape; + staticStrides[staticStrides.size() - 1] = + getAsIndexOpFoldResult(context, 1); + staticStrides[staticStrides.size() - 2] = linearizedShape; rewriter.setInsertionPoint(l3ToL2DmaOp); rewriter.create( - l3ToL2DmaOp.getLoc(), source, l3ToL2DmaOp.getTargetMixedOffsets(), - l3ToL2DmaOp.getTargetMixedSizes(), l3ToL2DmaOp.getTargetMixedStrides(), + l3ToL2DmaOp.getLoc(), source, llvm::ArrayRef(staticOffsets), + llvm::ArrayRef(staticSizes), llvm::ArrayRef(staticStrides), l3ToL2DmaOp.getSource(), l3ToL2DmaOp.getSourceMixedOffsets(), l3ToL2DmaOp.getSourceMixedSizes(), l3ToL2DmaOp.getSourceMixedStrides()); @@ -107,8 +117,8 @@ void AMDAIESplitBuffersPass::runOnOperation() { l2ToL1DmaOp.getLoc(), l2ToL1DmaOp.getTarget(), l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(), l2ToL1DmaOp.getTargetMixedStrides(), source, - l2ToL1DmaOp.getSourceMixedOffsets(), l2ToL1DmaOp.getSourceMixedSizes(), - l2ToL1DmaOp.getSourceMixedStrides()); + llvm::ArrayRef(staticOffsets), llvm::ArrayRef(staticSizes), + llvm::ArrayRef(staticStrides)); rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp); // We have to discard non-zero offsets as subview has been replaced by a // dedicated allocated memref. @@ -138,8 +148,8 @@ void AMDAIESplitBuffersPass::runOnOperation() { } // namespace -std::unique_ptr createAMDAIESplitBuffersPass() { - return std::make_unique(); +std::unique_ptr createAMDAIESplitLogicalObjectFifosPass() { + return std::make_unique(); } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index c6edd665b..1f8314142 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -82,7 +82,7 @@ iree_cc_library( "AMDAIEPad.cpp" "AMDAIEPeelForLoop.cpp" "AMDAIEPropagateDataLayout.cpp" - "AMDAIESplitBuffers.cpp" + "AMDAIESplitLogicalObjectFifos.cpp" "AMDAIETile.cpp" "AMDAIETileAndFuse.cpp" "AMDAIEUtils.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 5be6ddf82..5e2e8950d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -62,7 +62,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEPAD #define GEN_PASS_DEF_AMDAIEPEELFORLOOP #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT -#define GEN_PASS_DEF_AMDAIESPLITBUFFERS +#define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJECTFIFOS #define GEN_PASS_DEF_AMDAIETILE #define GEN_PASS_DEF_AMDAIETILEANDFUSE #define GEN_PASS_DEF_AMDAIEVECTORIZATION diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 38615a80a..505d04e81 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -578,7 +578,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createAMDAIEDistributeCoresAndObjectFifosPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createAMDAIESplitBuffersPass()); + passManager.addPass(createAMDAIESplitLogicalObjectFifosPass()); passManager.addPass(createAMDAIEDmaToCircularDmaPass()); passManager.addNestedPass(createAMDAIECreateAIEWorkgroupPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index b2b419f05..2530bc814 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -198,7 +198,7 @@ std::unique_ptr createAMDAIEPeelForLoopPass( AMDAIEPeelForLoopOptions options = {}); /// Create a pass to split buffers. -std::unique_ptr createAMDAIESplitBuffersPass(); +std::unique_ptr createAMDAIESplitLogicalObjectFifosPass(); /// Create pass to tile TilingInterface operations. std::unique_ptr createAMDAIETilePass(AMDAIETileOptions options = {}); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 2fc72fb2e..60dad7f9c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -402,10 +402,10 @@ def AMDAIEPropagateDataLayout : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEPropagateDataLayoutPass()"; } -def AMDAIESplitBuffers : - Pass<"iree-amdaie-split-buffers", "ModuleOp"> { +def AMDAIESplitLogicalObjectFifos : + Pass<"iree-amdaie-split-logical-objectfifos", "ModuleOp"> { let summary = "Pass to split L2 buffers to share inputs of Matmul and Elementwise operations."; - let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitBuffersPass()"; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjectFifosPass()"; } def AMDAIETile : diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 4909174e6..954d85509 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -54,7 +54,7 @@ iree_lit_test_suite( "pad.mlir" "peel_for_loop.mlir" "propagate_data_layout.mlir" - "split_buffers.mlir" + "split_logical_objectfifos.mlir" "tile_and_fuse_using_scf_for.mlir" "tile_and_fuse_using_scf_forall.mlir" "tile_copy_using_scf_for.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir similarity index 64% rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir index 9be8c2177..fbd4b697b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-buffers,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // Glossary: // candidate core op : they are those amdaie.core ops which have at least three input dma ops. @@ -16,59 +16,30 @@ module { func.func @split_l2_buffer_no_candidate_core_op(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { %c3 = arith.constant 3 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %c2048 = arith.constant 2048 : index - %c256 = arith.constant 256 : index - %c1024 = arith.constant 1024 : index - %c4096 = arith.constant 4096 : index - %c32 = arith.constant 32 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> - %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> %alloc_2 = memref.alloc() : memref<128x128xi32> - %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> %tile = amdaie.tile(%c1, %c3) %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> scf.forall (%arg4, %arg5) in (2, 2) { %2 = affine.apply #map(%arg5) %3 = affine.apply #map(%arg4) - %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %tile_4 = amdaie.tile(%c1, %c3) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %11 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %12 = amdaie.core(%tile_4, in : [%7, %8], out : [%11]) { + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_4, in : [%7], out : []) { %13 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> - %14 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> - %15 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%13, %14 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%15 : memref<1x1x8x8x4x4xi32, 2 : i32>) { - ^bb0(%in: i32, %in_5: i32, %out: i32): - %18 = arith.muli %in, %in_5 : i32 - %19 = arith.addi %out, %18 : i32 - linalg.yield %19 : i32 - } - %16 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - %17 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%15, %16 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%17 : memref<1x1x8x8x4x4xi32, 2 : i32>) { - ^bb0(%in: i32, %in_5: i32, %out: i32): - %18 = arith.addi %in, %in_5 : i32 - linalg.yield %18 : i32 - } + linalg.fill ins(%c0_i32 : i32) outs(%13 : memref<1x1x4x8x4x8xi32, 2 : i32>) amdaie.end } } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> - memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> - memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> memref.dealloc %alloc_2 : memref<128x128xi32> return @@ -91,15 +62,16 @@ module { // CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE]]} : // CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall -// CHECK: %[[DMA_CPY_ND_L3_TO_L2:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO]] -// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]] // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.dma_cpy_nd // CHECK: amdaie.dma_cpy_nd // CHECK: %[[L1_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]] // CHECK: %[[DMA_CPY_ND_L2_TO_L1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO]] -// CHECK-SAME: %[[L2_OBJECTFIFO]] +// CHECK-SAME: %[[L2_OBJECTFIFO]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] // CHECK: amdaie.core(%[[TILE]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1]]], out : // CHECK: linalg.generic // CHECK: } @@ -112,15 +84,6 @@ module { module { func.func @split_l2_buffer_one_candidate_core_op(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { %c3 = arith.constant 3 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %c2048 = arith.constant 2048 : index - %c256 = arith.constant 256 : index - %c1024 = arith.constant 1024 : index - %c4096 = arith.constant 4096 : index - %c32 = arith.constant 32 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index @@ -135,15 +98,15 @@ module { scf.forall (%arg4, %arg5) in (2, 2) { %2 = affine.apply #map(%arg5) %3 = affine.apply #map(%arg4) - %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %tile_4 = amdaie.tile(%c1, %c3) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %10 = amdaie.dma_cpy_nd(%9[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %11 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %12 = amdaie.core(%tile_4, in : [%7, %8, %10], out : [%11]) { %13 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> %14 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> @@ -195,23 +158,25 @@ module { // CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : // CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall -// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO_0]] -// CHECK-SAME: %[[L3_OBJECTFIFO]] -// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO_1]] -// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]] // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.dma_cpy_nd // CHECK: amdaie.dma_cpy_nd // CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} // CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_0]] -// CHECK-SAME: %[[L2_OBJECTFIFO_0]] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] // CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : // CHECK: linalg.generic // CHECK: } // CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} // CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_1]] -// CHECK-SAME: %[[L2_OBJECTFIFO_1]] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] // CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : // CHECK: linalg.generic // CHECK: } @@ -225,15 +190,6 @@ module { module { func.func @split_l2_buffer_two_core_ops(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { %c3 = arith.constant 3 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %c2048 = arith.constant 2048 : index - %c256 = arith.constant 256 : index - %c1024 = arith.constant 1024 : index - %c4096 = arith.constant 4096 : index - %c32 = arith.constant 32 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index @@ -249,15 +205,15 @@ module { scf.forall (%arg4, %arg5) in (2, 2) { %2 = affine.apply #map(%arg5) %3 = affine.apply #map(%arg4) - %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %tile_5 = amdaie.tile(%c1, %c3) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %10 = amdaie.dma_cpy_nd(%9[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %11 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> @@ -278,7 +234,7 @@ module { amdaie.end } %13 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %14 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %14 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %15 = amdaie.core(%tile_4, in : [%7, %8, %14], out : [%11]) { %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> %17 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> @@ -334,32 +290,34 @@ module { // CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : // CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall -// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO_0]] -// CHECK-SAME: %[[L3_OBJECTFIFO]] -// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO_1]] -// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]] // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.dma_cpy_nd // CHECK: amdaie.dma_cpy_nd -// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}], out : -// CHECK: linalg.generic +// CHECK: amdaie.core(%[[TILE_0]] +// CHECK: linalg.fill // CHECK: amdaie.end // CHECK: } // CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} // CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_0]] -// CHECK-SAME: %[[L2_OBJECTFIFO_0]] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] // CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : // CHECK: linalg.generic // CHECK: amdaie.end // CHECK: } -// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}], out : -// CHECK: linalg.generic +// CHECK: amdaie.core(%[[TILE_2]] +// CHECK: linalg.fill // CHECK: amdaie.end // CHECK: } // CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]} // CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_1]] -// CHECK-SAME: %[[L2_OBJECTFIFO_1]] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] // CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : // CHECK: linalg.generic // CHECK: amdaie.end @@ -376,18 +334,10 @@ module { module { func.func @split_l2_buffer_mixed_core_ops(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { %c3 = arith.constant 3 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %c2048 = arith.constant 2048 : index - %c256 = arith.constant 256 : index - %c1024 = arith.constant 1024 : index - %c4096 = arith.constant 4096 : index - %c32 = arith.constant 32 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> @@ -402,34 +352,20 @@ module { scf.forall (%arg4, %arg5) in (2, 2) { %2 = affine.apply #map(%arg5) %3 = affine.apply #map(%arg4) - %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %tile_7 = amdaie.tile(%c1, %c3) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %9 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %10 = amdaie.core(%tile_7, in : [%7, %8], out : [%9]) { - %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> - %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> - %20 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%18, %19 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { - ^bb0(%in: i32, %in_8: i32, %out: i32): - %23 = arith.muli %in, %in_8 : i32 - %24 = arith.addi %out, %23 : i32 - linalg.yield %24 : i32 - } - %21 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - %22 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%20, %21 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%22 : memref<1x1x8x8x4x4xi32, 2 : i32>) { - ^bb0(%in: i32, %in_8: i32, %out: i32): - %23 = arith.addi %in, %in_8 : i32 - linalg.yield %23 : i32 - } + %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.core(%tile_7, in : [%7], out : []) { + %11 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%11 : memref<1x1x4x8x4x8xi32, 2 : i32>) amdaie.end } %11 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %12 = amdaie.dma_cpy_nd(%11[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %13 = amdaie.core(%tile_4, in : [%7, %8, %12], out : [%9]) { %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> @@ -449,27 +385,13 @@ module { } amdaie.end } - %14 = amdaie.core(%tile_5, in : [%7, %8], out : [%9]) { - %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> - %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> - %20 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%18, %19 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { - ^bb0(%in: i32, %in_8: i32, %out: i32): - %23 = arith.muli %in, %in_8 : i32 - %24 = arith.addi %out, %23 : i32 - linalg.yield %24 : i32 - } - %21 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - %22 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%20, %21 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%22 : memref<1x1x8x8x4x4xi32, 2 : i32>) { - ^bb0(%in: i32, %in_8: i32, %out: i32): - %23 = arith.addi %in, %in_8 : i32 - linalg.yield %23 : i32 - } + %14 = amdaie.core(%tile_5, in : [%7], out : []) { + %15 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%15 : memref<1x1x4x8x4x8xi32, 2 : i32>) amdaie.end } %15 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %16 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %16 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %17 = amdaie.core(%tile_6, in : [%7, %8, %16], out : [%9]) { %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32>