From 671fadcac1902b3f85c1ca6bd913cc56f7fefd30 Mon Sep 17 00:00:00 2001 From: James Newling Date: Thu, 19 Sep 2024 11:55:18 -0700 Subject: [PATCH] Lowering of linalg.copy to dmas (via iree.linalg_ext.pack) (#780) Add support for lowering linalg.copy, unify with logic for lowering linalg.pack and linalg.unpack --- ...IEPackToDma.cpp => AMDAIEConvertToDma.cpp} | 70 ++++++++++++++++--- .../iree-amd-aie/Transforms/CMakeLists.txt | 2 +- .../iree-amd-aie/Transforms/PassDetail.h | 2 +- .../iree-amd-aie/Transforms/Passes.cpp | 2 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 4 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 41 +++++++++-- .../Transforms/test/CMakeLists.txt | 4 +- .../{pack_to_dma.mlir => convert_to_dma.mlir} | 57 ++++++++++++++- ...ures.mlir => convert_to_dma_failures.mlir} | 2 +- .../samples/matmul_pack_peel_objectfifo.mlir | 2 +- 10 files changed, 163 insertions(+), 23 deletions(-) rename compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/{AMDAIEPackToDma.cpp => AMDAIEConvertToDma.cpp} (83%) rename compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/{pack_to_dma.mlir => convert_to_dma.mlir} (78%) rename compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/{pack_to_dma_failures.mlir => convert_to_dma_failures.mlir} (94%) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackToDma.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp similarity index 83% rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackToDma.cpp rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp index e30835019..eea68e548 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPackToDma.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp @@ -10,13 +10,14 @@ #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" 
#include "llvm/Support/FormatVariadic.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Pass/Pass.h" -#define DEBUG_TYPE "iree-amdaie-pack-to-dma" +#define DEBUG_TYPE "iree-amdaie-convert-to-dma" + + namespace mlir::iree_compiler::AMDAIE { @@ -302,25 +303,76 @@ LogicalResult rewriteAsDma(PackOrUnpackOp op, IRRewriter &rewriter) { return rewriteAsDma(rewriter, op, input, output, innerTiles); } +/// Convert a linalg.copy operation on 2 memrefs to an equivalent pack/unpack +/// operation. If the linalg.copy operation is to a memory closer to the +/// core it is converted to a pack operation, otherwise an unpack operation. +/// +/// Note: we could convert all copies to packs, but it would be potentially +/// confusing to have pack ops moving data away from cores. +LogicalResult copyToPack(IRRewriter &rewriter, linalg::CopyOp copyOp) { + if (copyOp.getNumOperands() != 2 || copyOp.getNumResults() != 0) { + copyOp.emitOpError() + << "has " << copyOp.getNumOperands() << " operands and " + << copyOp.getNumResults() + << " results. It must have 2 operands and 0 results to convert " + "to an iree.linalg_ext dialect pack/unpack operation"; + return failure(); + } + // Setting up the 'identity' pack/unpack: + ArrayRef<int64_t> innerDimsPos{}; + ArrayRef<OpFoldResult> innerTiles{}; + + Value src = copyOp.getOperand(0); + Value dst = copyOp.getOperand(1); + + // MemRefTypes with no memory space attribute return 0 here, so this is safe. 
+ uint32_t srcMemspace = cast<MemRefType>(src.getType()).getMemorySpaceAsInt(); + uint32_t dstMemspace = cast<MemRefType>(dst.getType()).getMemorySpaceAsInt(); + const bool towardsCore = srcMemspace <= dstMemspace; + + rewriter.setInsertionPoint(copyOp); + if (towardsCore) { + rewriter.replaceOpWithNewOp<IREE::LinalgExt::PackOp>( + copyOp, src, dst, innerDimsPos, innerTiles); + } else { + rewriter.replaceOpWithNewOp<IREE::LinalgExt::UnPackOp>( + copyOp, src, dst, innerDimsPos, innerTiles); + } + + return success(); +} + }; // namespace -class AMDAIEPackToDmaPass - : public impl::AMDAIEPackToDmaBase<AMDAIEPackToDmaPass> { +class AMDAIEConvertToDmaPass + : public impl::AMDAIEConvertToDmaBase<AMDAIEConvertToDmaPass> { public: void getDependentDialects(DialectRegistry &registry) const override { registry.insert(); } - AMDAIEPackToDmaPass() = default; - AMDAIEPackToDmaPass(const AMDAIEPackToDmaPass &pass){}; + AMDAIEConvertToDmaPass() = default; + AMDAIEConvertToDmaPass(const AMDAIEConvertToDmaPass &pass){}; void runOnOperation() override; }; -void AMDAIEPackToDmaPass::runOnOperation() { +void AMDAIEConvertToDmaPass::runOnOperation() { MLIRContext *context = &getContext(); IRRewriter rewriter(context); + // Convert all linalg.copy to iree_linalg_ext.pack/unpack ops. We then + // bootstrap the work done for lowering the pack/unpack op to dmas as the next + // step. This is easy to implement, but not the most direct lowering, so + // we might want to revisit this. 
+ WalkResult convertCopiesWalkResult = + getOperation()->walk([&rewriter](linalg::CopyOp copyOp) { + if (failed(copyToPack(rewriter, copyOp))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + if (convertCopiesWalkResult.wasInterrupted()) return signalPassFailure(); + auto walkResult = getOperation()->walk( [&rewriter](IREE::LinalgExt::PackOp op) -> WalkResult { if (failed(rewriteAsDma(op, rewriter))) { @@ -339,7 +391,7 @@ void AMDAIEPackToDmaPass::runOnOperation() { if (walkResult.wasInterrupted()) signalPassFailure(); } -std::unique_ptr createAMDAIEPackToDmaPass() { - return std::make_unique(); +std::unique_ptr createAMDAIEConvertToDmaPass() { + return std::make_unique(); } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index c8c288835..79afc3250 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -54,6 +54,7 @@ iree_cc_library( "AMDAIECanonicalizeNpuDmaCpyNd.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" "AMDAIECombineStridedOps.cpp" + "AMDAIEConvertToDma.cpp" "AMDAIEControlCodeLoopUnroll.cpp" "AMDAIEConvertCoreForallToFor.cpp" "AMDAIECreateAIEWorkgroup.cpp" @@ -85,7 +86,6 @@ iree_cc_library( "AMDAIENormalizeLoopBounds.cpp" "AMDAIEObjFifoBufferization.cpp" "AMDAIEPackAndTranspose.cpp" - "AMDAIEPackToDma.cpp" "AMDAIEPad.cpp" "AMDAIEPeelForLoop.cpp" "AMDAIEPropagateDataLayout.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 06172300d..f1ce8b4fa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -64,7 +64,7 @@ namespace mlir::iree_compiler::AMDAIE { #define 
GEN_PASS_DEF_AMDAIENORMALIZELOOPBOUNDS #define GEN_PASS_DEF_AMDAIEOBJFIFOBUFFERIZATION #define GEN_PASS_DEF_AMDAIEPACKANDTRANSPOSE -#define GEN_PASS_DEF_AMDAIEPACKTODMA +#define GEN_PASS_DEF_AMDAIECONVERTTODMA #define GEN_PASS_DEF_AMDAIEPAD #define GEN_PASS_DEF_AMDAIEPEELFORLOOP #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 3579daa1e..7ce66d076 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -584,7 +584,7 @@ void buildAMDAIETransformPassPipeline(OpPassManager &variantPassManager, void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); - passManager.addPass(createAMDAIEPackToDmaPass()); + passManager.addPass(createAMDAIEConvertToDmaPass()); passManager.addPass(createAMDAIENormalizeLoopBoundsPass()); passManager.addPass(createAMDAIEInsertCoresPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 637e1cf83..d9bc0712d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -209,9 +209,9 @@ std::unique_ptr createAMDAIEObjFifoBufferizationPass(); std::unique_ptr createAMDAIEPackAndTransposePass( AMDAIEPackAndTransposeOptions options = {}); -/// Create pass to lower pack/unpack ops to AMDAIE DMA ops operating on +/// Create pass to lower copy/pack/unpack ops to AMDAIE DMA ops operating on /// logical objectFifos. -std::unique_ptr createAMDAIEPackToDmaPass(); +std::unique_ptr createAMDAIEConvertToDmaPass(); /// Create a pass to pad MatmulOp. 
std::unique_ptr createAMDAIEPadPass(AMDAIEPadOptions options = {}); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 941857577..2bb171cc8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -411,10 +411,43 @@ def AMDAIEPackAndTranspose : ]; } -def AMDAIEPackToDma : - Pass<"iree-amdaie-pack-to-dma", ""> { - let summary = "Convert pack/unpack ops to AMDAIE DMA ops operating on logical objectFifos."; - let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEPackToDmaPass()"; +def AMDAIEConvertToDma : + Pass<"iree-amdaie-convert-to-dma", ""> { + let summary = "Convert linalg ops to AMDAIE DMA ops on logical objectFifos."; + let description = [{ + Converts/lowers linalg.copy, iree_linalg_ext.pack, and iree_linalg_ext.unpack + ops to dma operations. For example, given allocations, + + ```mlir + %src = memref.alloc() : memref<8x16xi32, 1> + %dst = memref.alloc() : memref<1x1x8x16xi32, 1> + ``` + + and a packing copy operation between them, + ```mlir + iree_linalg_ext.pack %src inner_dims_pos = [0, 1] inner_tiles = [8, 16] + into %dst : (memref<8x16xi32, 1> memref<1x1x8x16xi32, 1>) + ``` + + the pack gets lowered to, + ```mlir + %0 = amdaie.logicalobjectfifo.from_memref %dst, {} : memref<1x1x8x16xi32, 1> + -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>> + %1 = amdaie.logicalobjectfifo.from_memref %src, {} : memref<8x16xi32, 1> + -> !amdaie.logicalobjectfifo<memref<8x16xi32, 1>> + %2 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], + %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : + (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, + !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) + ``` + + The approach for converting linalg.copy operations is to first convert them + to identity iree_linalg_ext dialect pack/unpack operations, and then rely on + the lowering of pack/unpack to dma operations to do the heavy lifting. 
+ } + + }]; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConvertToDmaPass()"; } def AMDAIEPad : diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index ebac90ebf..aacbd297a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -55,8 +55,8 @@ iree_lit_test_suite( "pack_and_transpose_level1.mlir" "pack_and_transpose_level2.mlir" "pack_to_air.mlir" - "pack_to_dma.mlir" - "pack_to_dma_failures.mlir" + "convert_to_dma.mlir" + "convert_to_dma_failures.mlir" "pad.mlir" "peel_for_loop.mlir" "propagate_data_layout.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_dma.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma.mlir similarity index 78% rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_dma.mlir rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma.mlir index 02b714826..5ad8d0b63 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_dma.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --iree-amdaie-pack-to-dma --cse --split-input-file %s | FileCheck %s +// RUN: iree-opt --iree-amdaie-convert-to-dma --cse --split-input-file %s | FileCheck %s // CHECK-LABEL: @basic_unitdim_pack // CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<1x1x8x16xi32, 1> @@ -132,6 +132,7 @@ func.func @permute_unpack() { } // ----- + // CHECK-LABEL: @subview_unpack // CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<32x8x64xf32> // CHECK: %[[FROMMEMREF0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC0]], {} : memref<32x8x64xf32> -> !amdaie.logicalobjectfifo<memref<32x8x64xf32>> @@ -157,3 +158,57 @@ func.func @subview_unpack() { } 
return } + +// ----- + +// CHECK-LABEL: @basic_copy +// CHECK: %[[SRC:.*]] = memref.alloc() : memref<8x16xi32, 1> +// CHECK: %[[FROMSRC:.*]] = amdaie.logicalobjectfifo.from_memref %[[SRC]], {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<8x16xi32, 1>> +// CHECK: %[[DST:.*]] = memref.alloc() : memref<8x16xi32, 1> +// CHECK: %[[FROMDST:.*]] = amdaie.logicalobjectfifo.from_memref %[[DST]], {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<8x16xi32, 1>> +// CHECK: %[[DMA0:.*]] = amdaie.dma_cpy_nd +// CHECK-SAME: %[[FROMDST]][0, 0] [8, 16] [16, 1] +// CHECK-SAME: %[[FROMSRC]][0, 0] [8, 16] [16, 1] +// CHECK-SAME: (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) +func.func @basic_copy() { + %src = memref.alloc() : memref<8x16xi32, 1> + %dst = memref.alloc() : memref<8x16xi32, 1> + linalg.copy ins(%src : memref<8x16xi32, 1>) outs(%dst : memref<8x16xi32, 1>) + return +} + +// ----- + +// CHECK-LABEL: @copy_towards_core +// CHECK: %[[SRC:.*]] = memref.alloc() : memref<8xi32> +// CHECK: %[[FROMSRC:.*]] = amdaie.logicalobjectfifo.from_memref %[[SRC]], {} : memref<8xi32> -> !amdaie.logicalobjectfifo<memref<8xi32>> +// CHECK: %[[DST:.*]] = memref.alloc() : memref<8xi32, 1> +// CHECK: %[[FROMDST:.*]] = amdaie.logicalobjectfifo.from_memref %[[DST]], {} : memref<8xi32, 1> -> !amdaie.logicalobjectfifo<memref<8xi32, 1>> +// CHECK: %[[DMA0:.*]] = amdaie.dma_cpy_nd +// CHECK-SAME: %[[FROMDST]][0] [8] [1] +// CHECK-SAME: %[[FROMSRC]][0] [8] [1] +// CHECK-SAME: (!amdaie.logicalobjectfifo<memref<8xi32, 1>>, !amdaie.logicalobjectfifo<memref<8xi32>>) +func.func @copy_towards_core() { + %src = memref.alloc() : memref<8xi32> + %dst = memref.alloc() : memref<8xi32, 1> + linalg.copy ins(%src : memref<8xi32>) outs(%dst : memref<8xi32, 1>) + return +} + +// ----- + +// CHECK-LABEL: @copy_away_from_core +// CHECK: %[[SRC:.*]] = memref.alloc() : memref<8xi32, 2> +// CHECK: %[[FROMSRC:.*]] = amdaie.logicalobjectfifo.from_memref %[[SRC]], {} : memref<8xi32, 2> -> !amdaie.logicalobjectfifo<memref<8xi32, 2>> +// CHECK: %[[DST:.*]] = memref.alloc() : memref<8xi32, 1> +// CHECK: 
%[[FROMDST:.*]] = amdaie.logicalobjectfifo.from_memref %[[DST]], {} : memref<8xi32, 1> -> !amdaie.logicalobjectfifo<memref<8xi32, 1>> +// CHECK: %[[DMA0:.*]] = amdaie.dma_cpy_nd +// CHECK-SAME: %[[FROMDST]][0] [8] [1] +// CHECK-SAME: %[[FROMSRC]][0] [8] [1] +// CHECK-SAME: (!amdaie.logicalobjectfifo<memref<8xi32, 1>>, !amdaie.logicalobjectfifo<memref<8xi32, 2>>) +func.func @copy_away_from_core() { + %src = memref.alloc() : memref<8xi32, 2> + %dst = memref.alloc() : memref<8xi32, 1> + linalg.copy ins(%src : memref<8xi32, 2>) outs(%dst : memref<8xi32, 1>) + return +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_dma_failures.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma_failures.mlir similarity index 94% rename from compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_dma_failures.mlir rename to compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma_failures.mlir index 2cb247df3..992ae4190 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_dma_failures.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma_failures.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt %s --iree-amdaie-pack-to-dma -verify-diagnostics +// RUN: iree-opt %s --iree-amdaie-convert-to-dma -verify-diagnostics #map = affine_map<()[s0] -> (s0 * 8)> diff --git a/tests/samples/matmul_pack_peel_objectfifo.mlir b/tests/samples/matmul_pack_peel_objectfifo.mlir index 58098c399..8c245dd68 100644 --- a/tests/samples/matmul_pack_peel_objectfifo.mlir +++ b/tests/samples/matmul_pack_peel_objectfifo.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt 
--pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-dma-cse,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,canonicalize,iree-amdaie-canonicalize-npu-dma-cpy-nd,canonicalize,iree-amdaie-sink-into-core,canonicalize,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-convert-to-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-dma-cse,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,canonicalize,iree-amdaie-canonicalize-npu-dma-cpy-nd,canonicalize,iree-amdaie-sink-into-core,canonicalize,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s // CHECK: aie.device(npu1_4col) // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)