From f8f31a8100af2f3c464abfc868c003afbc7f8dbb Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Tue, 27 Aug 2024 14:04:28 +0200 Subject: [PATCH] Add logical objFifo placeholder op for connection reuse (#709) Adds the `amdaie.logicalobjectfifo.placeholder` operation that represents a logical objectFifo to be filled in later. This enables reuse of connections/circular DMAs/physical AIE channels for different data packets, which helps with (fused) operations with more than 2 inputs. This is especially useful for reading from/writing to DDR. --- .../IR/AMDAIELogicalObjFifoOpInterface.cpp | 10 + .../IR/AMDAIELogicalObjFifoOpInterface.h | 16 + .../IR/AMDAIELogicalObjFifoOpInterface.td | 37 +++ .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp | 275 +++++++++++++++--- .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h | 1 + .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td | 98 +++++-- .../AMD-AIE/iree-amd-aie/IR/CMakeLists.txt | 15 + .../iree-amd-aie/IR/test/roundtrip.mlir | 55 ++++ .../Transforms/AMDAIEAssignNpuDmaBdIds.cpp | 41 ++- .../Transforms/AMDAIECreateAIEWorkgroup.cpp | 38 ++- .../AMDAIECreateLogicalObjectFifoLink.cpp | 12 +- .../Transforms/AMDAIELowerToAIE.cpp | 53 +++- .../test/assign_npu_dma_bd_ids.mlir | 128 ++++---- .../Transforms/test/create_aie_workgroup.mlir | 224 +++++++------- .../Transforms/test/lower_to_aie.mlir | 106 ++++--- 15 files changed, 782 insertions(+), 327 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp new file mode 100644 index 000000000..285fa77c3 --- /dev/null +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp @@ -0,0 +1,10 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h" + +/// Include the definitions of the logical-objFifo-like interfaces. +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp.inc" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h new file mode 100644 index 000000000..e34f129af --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h @@ -0,0 +1,16 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_AMDAIE_LOGICALOBJFIFOOPINTERFACE_H_ +#define IREE_COMPILER_AMDAIE_LOGICALOBJFIFOOPINTERFACE_H_ + +#include "mlir/IR/OpImplementation.h" + +// clang-format off +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h.inc" +// clang-format on + +#endif // IREE_COMPILER_AMDAIE_LOGICALOBJFIFOOPINTERFACE_H_ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td new file mode 100644 index 000000000..4fbfc0a91 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td @@ -0,0 +1,37 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMDAIE_DIALECT_LOGICALOBJFIFOOPINTERFACE +#define IREE_AMDAIE_DIALECT_LOGICALOBJFIFOOPINTERFACE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/CopyOpInterface.td" + +//===----------------------------------------------------------------------===// +// Defines the interface for logical objectFifo operations. +//===----------------------------------------------------------------------===// + +def LogicalObjFifoOpInterface : OpInterface<"LogicalObjFifoOpInterface"> { + let description = [{ + Interface for operations creating a logical objectFifo. + }]; + let cppNamespace = "mlir::iree_compiler::AMDAIE"; + + let methods = [ + InterfaceMethod< + /*desc=*/"Return the assigned tiles.", + /*retTy=*/"::mlir::OperandRange", + /*methodName=*/"getTiles", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return $_op.getTiles(); + }] + > + ]; +} + +#endif // IREE_AMDAIE_DIALECT_LOGICALOBJFIFOOPINTERFACE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index a6a65ac19..0c501fb3e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -522,16 +522,14 @@ void LogicalObjectFifoRelease::build(OpBuilder &b, mlir::OperationState &result, // AMDAIE_NpuDmaCpyNdOp //===----------------------------------------------------------------------===// -// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target -// and source BD IDs. 
-void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, - ArrayRef targetOffsets, - ArrayRef targetSizes, - ArrayRef targetStrides, - ArrayRef sourceOffsets, - ArrayRef sourceSizes, - ArrayRef sourceStrides, - mlir::Value targetBdId, mlir::Value sourceBdId) { +// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target and +// source BD IDs. +void NpuDmaCpyNdOp::build( + OpBuilder &b, OperationState &result, Value dma, Value target, + ArrayRef targetOffsets, ArrayRef targetSizes, + ArrayRef targetStrides, Value targetBdId, Value source, + ArrayRef sourceOffsets, ArrayRef sourceSizes, + ArrayRef sourceStrides, Value sourceBdId) { SmallVector staticTargetOffsets, staticTargetSizes, staticTargetStrides; SmallVector staticSourceOffsets, staticSourceSizes, @@ -552,22 +550,21 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, staticSourceSizes); dispatchIndexOpFoldResults(sourceStrides, dynamicSourceStrides, staticSourceStrides); - build(b, result, b.getIndexType(), dma, dynamicTargetOffsets, + build(b, result, b.getIndexType(), dma, target, dynamicTargetOffsets, dynamicTargetSizes, dynamicTargetStrides, staticTargetOffsets, - staticTargetSizes, staticTargetStrides, dynamicSourceOffsets, - dynamicSourceSizes, dynamicSourceStrides, staticSourceOffsets, - staticSourceSizes, staticSourceStrides, targetBdId, sourceBdId); + staticTargetSizes, staticTargetStrides, targetBdId, source, + dynamicSourceOffsets, dynamicSourceSizes, dynamicSourceStrides, + staticSourceOffsets, staticSourceSizes, staticSourceStrides, + sourceBdId); } // Build a NpuDmaCpyNdOp with static entries. 
-void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, - ArrayRef targetOffsets, - ArrayRef targetSizes, - ArrayRef targetStrides, - ArrayRef sourceOffsets, - ArrayRef sourceSizes, - ArrayRef sourceStrides, - mlir::Value targetBdId, mlir::Value sourceBdId) { +void NpuDmaCpyNdOp::build( + OpBuilder &b, OperationState &result, Value dma, Value target, + ArrayRef targetOffsets, ArrayRef targetSizes, + ArrayRef targetStrides, mlir::Value targetBdId, Value source, + ArrayRef sourceOffsets, ArrayRef sourceSizes, + ArrayRef sourceStrides, mlir::Value sourceBdId) { SmallVector targetOffsetValues = llvm::to_vector<4>( llvm::map_range(targetOffsets, [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); @@ -592,17 +589,18 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, llvm::map_range(sourceStrides, [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); })); - build(b, result, dma, targetOffsetValues, targetSizeValues, - targetStrideValues, sourceOffsetValues, sourceSizeValues, - sourceStrideValues, targetBdId, sourceBdId); + build(b, result, dma, target, targetOffsetValues, targetSizeValues, + targetStrideValues, targetBdId, source, sourceOffsetValues, + sourceSizeValues, sourceStrideValues, sourceBdId); } // Build a NpuDmaCpyNdOp with dynamic entries. 
void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, - ValueRange targetOffsets, ValueRange targetSizes, - ValueRange targetStrides, ValueRange sourceOffsets, - ValueRange sourceSizes, ValueRange sourceStrides, - mlir::Value targetBdId, mlir::Value sourceBdId) { + Value target, ValueRange targetOffsets, + ValueRange targetSizes, ValueRange targetStrides, + mlir::Value targetBdId, Value source, + ValueRange sourceOffsets, ValueRange sourceSizes, + ValueRange sourceStrides, mlir::Value sourceBdId) { SmallVector targetOffsetValues = llvm::to_vector<4>(llvm::map_range( targetOffsets, [](Value v) -> OpFoldResult { return v; })); @@ -619,9 +617,212 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, SmallVector sourceStrideValues = llvm::to_vector<4>(llvm::map_range( sourceStrides, [](Value v) -> OpFoldResult { return v; })); - build(b, result, dma, targetOffsetValues, targetSizeValues, - targetStrideValues, sourceOffsetValues, sourceSizeValues, - sourceStrideValues, targetBdId, sourceBdId); + build(b, result, dma, target, targetOffsetValues, targetSizeValues, + targetStrideValues, targetBdId, source, sourceOffsetValues, + sourceSizeValues, sourceStrideValues, sourceBdId); +} + +void NpuDmaCpyNdOp::print(OpAsmPrinter &p) { + Operation *op = getOperation(); + p << " " << getDma() << "("; + if (getTarget()) p << getTarget(); + printDynamicIndexList(p, op, getTargetOffsets(), getTargetStaticOffsets()); + p << " "; + printDynamicIndexList(p, op, getTargetSizes(), getTargetStaticSizes()); + p << " "; + printDynamicIndexList(p, op, getTargetStrides(), getTargetStaticStrides()); + if (getTargetBdId()) p << " bd_id = " << getTargetBdId(); + p << ", "; + if (getSource()) p << getSource(); + printDynamicIndexList(p, op, getSourceOffsets(), getSourceStaticOffsets()); + p << " "; + printDynamicIndexList(p, op, getSourceSizes(), getSourceStaticSizes()); + p << " "; + printDynamicIndexList(p, op, getSourceStrides(), 
getSourceStaticStrides()); + if (getSourceBdId()) p << " bd_id = " << getSourceBdId(); + p << ")"; + SmallVector elidedAttrs; + elidedAttrs.push_back("operandSegmentSizes"); + elidedAttrs.push_back("target_static_offsets"); + elidedAttrs.push_back("target_static_sizes"); + elidedAttrs.push_back("target_static_strides"); + elidedAttrs.push_back("source_static_offsets"); + elidedAttrs.push_back("source_static_sizes"); + elidedAttrs.push_back("source_static_strides"); + p.printOptionalAttrDictWithKeyword(op->getAttrs(), elidedAttrs); + if (getTarget() || getSource()) p << " :"; + if (getTarget()) p << " target_type = " << getTarget().getType(); + if (getSource()) p << " source_type = " << getSource().getType(); +} + +ParseResult NpuDmaCpyNdOp::parse(OpAsmParser &parser, OperationState &result) { + OpBuilder b(parser.getContext()); + auto indexType = b.getIndexType(); + + SMLoc targetOperandsLoc, sourceOperandsLoc; + OpAsmParser::UnresolvedOperand dma; + SmallVector targetOperands, sourceOperands, + targetBdIdOperands, sourceBdIdOperands; + DenseI64ArrayAttr targetStaticOffsets, targetStaticSizes, targetStaticStrides; + SmallVector targetDynamicOffsets, + targetDynamicSizes, targetDynamicStrides; + DenseI64ArrayAttr sourceStaticOffsets, sourceStaticSizes, sourceStaticStrides; + SmallVector sourceDynamicOffsets, + sourceDynamicSizes, sourceDynamicStrides; + SmallVector targetTypes; + SmallVector sourceTypes; + + if (failed(parser.parseOperand(dma)) || failed(parser.parseLParen())) + return failure(); + + OpAsmParser::UnresolvedOperand target; + if (parser.parseOptionalOperand(target).has_value()) { + targetOperands.push_back(target); + } + if (failed(parseDynamicIndexList(parser, targetDynamicOffsets, + targetStaticOffsets))) { + return failure(); + } + result.getOrAddProperties().target_static_offsets = + targetStaticOffsets; + if (failed(parseDynamicIndexList(parser, targetDynamicSizes, + targetStaticSizes))) { + return failure(); + } + 
result.getOrAddProperties().target_static_sizes = + targetStaticSizes; + if (failed(parseDynamicIndexList(parser, targetDynamicStrides, + targetStaticStrides))) { + return failure(); + } + result.getOrAddProperties().target_static_strides = + targetStaticStrides; + + if (succeeded(parser.parseOptionalKeyword("bd_id"))) { + if (failed(parser.parseEqual())) return failure(); + OpAsmParser::UnresolvedOperand bdId; + if (failed(parser.parseOperand(bdId))) return failure(); + targetBdIdOperands.push_back(bdId); + } + + if (failed(parser.parseComma())) return failure(); + + OpAsmParser::UnresolvedOperand source; + if (parser.parseOptionalOperand(source).has_value()) { + sourceOperands.push_back(source); + } + if (failed(parseDynamicIndexList(parser, sourceDynamicOffsets, + sourceStaticOffsets))) { + return failure(); + } + result.getOrAddProperties().source_static_offsets = + sourceStaticOffsets; + if (failed(parseDynamicIndexList(parser, sourceDynamicSizes, + sourceStaticSizes))) { + return failure(); + } + result.getOrAddProperties().source_static_sizes = + sourceStaticSizes; + if (failed(parseDynamicIndexList(parser, sourceDynamicStrides, + sourceStaticStrides))) { + return failure(); + } + result.getOrAddProperties().source_static_strides = + sourceStaticStrides; + + if (succeeded(parser.parseOptionalKeyword("bd_id"))) { + if (failed(parser.parseEqual())) return failure(); + OpAsmParser::UnresolvedOperand bdId; + if (failed(parser.parseOperand(bdId))) return failure(); + sourceBdIdOperands.push_back(bdId); + } + + if (failed(parser.parseRParen())) return failure(); + { + auto loc = parser.getCurrentLocation(); + if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) { + return failure(); + } + } + + if (succeeded(parser.parseOptionalColon())) { + if 
(succeeded(parser.parseOptionalKeyword("target_type"))) { + if (parser.parseEqual()) return failure(); + Type targetType; + if (failed(parser.parseType(targetType))) return failure(); + targetTypes.push_back(targetType); + } + if (succeeded(parser.parseOptionalKeyword("source_type"))) { + if (parser.parseEqual()) return failure(); + Type sourceType; + if (failed(parser.parseType(sourceType))) return failure(); + sourceTypes.push_back(sourceType); + } + } + + llvm::copy( + ArrayRef({1, static_cast(targetOperands.size()), + static_cast(targetDynamicOffsets.size()), + static_cast(targetDynamicSizes.size()), + static_cast(targetDynamicStrides.size()), + static_cast(targetBdIdOperands.size()), + static_cast(sourceOperands.size()), + static_cast(sourceDynamicOffsets.size()), + static_cast(sourceDynamicSizes.size()), + static_cast(sourceDynamicStrides.size()), + static_cast(sourceBdIdOperands.size())}), + result.getOrAddProperties() + .operandSegmentSizes.begin()); + + if (failed(parser.resolveOperand(dma, indexType, result.operands))) + return failure(); + if (failed(parser.resolveOperands(targetOperands, targetTypes, + targetOperandsLoc, result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(targetDynamicOffsets, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(targetDynamicSizes, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(targetDynamicStrides, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(targetBdIdOperands, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(sourceOperands, sourceTypes, + sourceOperandsLoc, result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(sourceDynamicOffsets, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(sourceDynamicSizes, indexType, + result.operands))) { + 
return failure(); + } + if (failed(parser.resolveOperands(sourceDynamicStrides, indexType, + result.operands))) { + return failure(); + } + if (failed(parser.resolveOperands(sourceBdIdOperands, indexType, + result.operands))) { + return failure(); + } + + result.addTypes(indexType); + return success(); } DoublyStridedOpInterface NpuDmaCpyNdOp::createDoublyStridedOp( @@ -634,14 +835,15 @@ DoublyStridedOpInterface NpuDmaCpyNdOp::createDoublyStridedOp( ::llvm::SmallVector &newSourceStrides) { Location loc = (*this)->getLoc(); auto newOp = rewriter.create( - loc, getDma(), + loc, getDma(), getTarget(), getValueOrCreateConstantIndexOp(rewriter, loc, newTargetOffsets), getValueOrCreateConstantIndexOp(rewriter, loc, newTargetSizes), getValueOrCreateConstantIndexOp(rewriter, loc, newTargetStrides), + getTargetBdId(), getSource(), getValueOrCreateConstantIndexOp(rewriter, loc, newSourceOffsets), getValueOrCreateConstantIndexOp(rewriter, loc, newSourceSizes), getValueOrCreateConstantIndexOp(rewriter, loc, newSourceStrides), - getTargetBdId(), getSourceBdId()); + getSourceBdId()); return cast(newOp.getOperation()); } @@ -660,8 +862,9 @@ struct NpuDmaCpyNdOpReplacementBuilder { ArrayRef srcMixedSizes, ArrayRef srcMixedStrides) { rewriter.replaceOpWithNewOp( - dmaOp, dmaOp.getDma(), tgtMixedOffsets, tgtMixedSizes, tgtMixedStrides, - srcMixedOffsets, srcMixedSizes, srcMixedStrides, dmaOp.getTargetBdId(), + dmaOp, dmaOp.getDma(), dmaOp.getTarget(), tgtMixedOffsets, + tgtMixedSizes, tgtMixedStrides, dmaOp.getTargetBdId(), + dmaOp.getSource(), srcMixedOffsets, srcMixedSizes, srcMixedStrides, dmaOp.getSourceBdId()); } }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h index 7a292470f..b3cafdbf7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.h @@ -9,6 +9,7 @@ #include "iree-amd-aie/IR/AMDAIEAttrs.h" #include 
"iree-amd-aie/IR/AMDAIEDmaOpInterface.h" +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h" #include "iree-amd-aie/IR/AMDAIETypes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 763d3a165..0f46400da 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -18,6 +18,7 @@ include "iree-amd-aie/IR/AMDAIEAttrs.td" include "iree-amd-aie/aie_runtime/AMDAIEEnums.td" include "iree-amd-aie/IR/AMDAIEDialect.td" include "iree-amd-aie/IR/AMDAIEDmaOpInterface.td" +include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.td" include "iree-amd-aie/IR/AMDAIETypes.td" //===----------------------------------------------------------------------===// @@ -208,8 +209,8 @@ def AMDAIE_BdIdOp: AMDAIE_Op<"bd_id", [ // IREE AMDAIE Npu Ops //===----------------------------------------------------------------------===// -def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", - [AttrSizedOperandSegments, DoublyStridedOpInterface]>, +def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [ + AttrSizedOperandSegments, DoublyStridedOpInterface]>, Results<(outs Index)> { let summary = "The Npu uController's dma operator"; let description = [{ @@ -239,58 +240,50 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", let arguments = ( ins Index:$dma, + Optional:$target, Variadic:$target_offsets, Variadic:$target_sizes, Variadic:$target_strides, DenseI64ArrayAttr:$target_static_offsets, DenseI64ArrayAttr:$target_static_sizes, DenseI64ArrayAttr:$target_static_strides, + Optional:$target_bd_id, + Optional:$source, Variadic:$source_offsets, Variadic:$source_sizes, Variadic:$source_strides, DenseI64ArrayAttr:$source_static_offsets, DenseI64ArrayAttr:$source_static_sizes, DenseI64ArrayAttr:$source_static_strides, - Optional:$target_bd_id, 
Optional:$source_bd_id ); - let assemblyFormat = [{ - $dma - `(` - custom($target_offsets, $target_static_offsets) - custom($target_sizes, $target_static_sizes) - custom($target_strides, $target_static_strides) - (`bd_id` `=` $target_bd_id^)? - `,` - custom($source_offsets, $source_static_offsets) - custom($source_sizes, $source_static_sizes) - custom($source_strides, $source_static_strides) - (`bd_id` `=` $source_bd_id^)? - `)` - attr-dict - }]; + // Use a custom assembly format because of weird spaces being inserted around + // the optional `target` by the default assembly format generator. + let hasCustomAssemblyFormat = 1; let builders = [ // Build a NpuDmaCpyNdOp with mixed static and dynamic entries. - OpBuilder<(ins "Value":$dma, "ArrayRef":$target_offsets, + OpBuilder<(ins "Value":$dma, "::mlir::Value":$target, + "ArrayRef":$target_offsets, "ArrayRef":$target_sizes, - "ArrayRef":$target_strides, - "ArrayRef":$source_offsets, + "ArrayRef":$target_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source, "ArrayRef":$source_offsets, "ArrayRef":$source_sizes, - "ArrayRef":$source_strides, "::mlir::Value":$target_bd_id, - "::mlir::Value":$source_bd_id)>, + "ArrayRef":$source_strides, "::mlir::Value":$source_bd_id)>, // Build a NpuDmaCpyNdOp with static entries. - OpBuilder<(ins "Value":$dma, "ArrayRef":$target_offsets, - "ArrayRef":$target_sizes, "ArrayRef":$target_strides, - "ArrayRef":$source_offsets, "ArrayRef":$source_sizes, - "ArrayRef":$source_strides, "::mlir::Value":$target_bd_id, + OpBuilder<(ins "Value":$dma, "::mlir::Value":$target, + "ArrayRef":$target_offsets, "ArrayRef":$target_sizes, + "ArrayRef":$target_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source, "ArrayRef":$source_offsets, + "ArrayRef":$source_sizes, "ArrayRef":$source_strides, "::mlir::Value":$source_bd_id)>, // Build a NpuDmaCpyNdOp with dynamic entries. 
- OpBuilder<(ins "Value":$dma, "ValueRange":$target_offsets, - "ValueRange":$target_sizes, "ValueRange":$target_strides, - "ValueRange":$source_offsets, "ValueRange":$source_sizes, - "ValueRange":$source_strides, "::mlir::Value":$target_bd_id, + OpBuilder<(ins "Value":$dma, "::mlir::Value":$target, + "ValueRange":$target_offsets, "ValueRange":$target_sizes, + "ValueRange":$target_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source, "ValueRange":$source_offsets, + "ValueRange":$source_sizes, "ValueRange":$source_strides, "::mlir::Value":$source_bd_id)> ]; @@ -523,7 +516,8 @@ def AMDAIE_LogicalObjectFifoAcquire: } def AMDAIE_LogicalObjectFifoFromMemrefOp - : AMDAIE_Op<"logicalobjectfifo.from_memref", [Pure]> { + : AMDAIE_Op<"logicalobjectfifo.from_memref", + [LogicalObjFifoOpInterface, Pure]> { let summary = "Create a logical objectFifo from a memref"; let description = [{ Creates a logical objectFifo which encapsulates a memref. The logical objectFifo @@ -631,6 +625,46 @@ def AMDAIE_LogicalObjectFifoLink }]; } +def AMDAIE_LogicalObjectFifoPlaceholderOp: + AMDAIE_Op<"logicalobjectfifo.placeholder", [ + LogicalObjFifoOpInterface, Pure]> { + let summary = "A placeholder for a logical objectFifo."; + let description = [{ + Represents a placeholder for a logical objectFifo. The actual logical + objectFifo can then be provided later. This is useful for creating static + connections (`amdaie.circular_dma_cpy_nd`) that can be reused for different + logical objectFifos. 
+ + Example: + ```mlir + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) + binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1024xi32> + %alloc = memref.alloc() : memref<1024xi32, 1 : i32> + %obj0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_1} + : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo<memref<1024xi32, 1>> + %ph = amdaie.logicalobjectfifo.placeholder{} + : !amdaie.logicalobjectfifo<memref<1024xi32>> + %connection = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %ph[] [] []) + : (!amdaie.logicalobjectfifo<memref<1024xi32, 1>>, + !amdaie.logicalobjectfifo<memref<1024xi32>>) + amdaie.controlcode { + %obj1 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} + : memref<1024xi32> -> !amdaie.logicalobjectfifo<memref<1024xi32>> + %npu_dma = amdaie.npu.dma_cpy_nd %connection([] [] [], + %obj1[%c0, %c32] [%c32, %c32] [%c32, %c1]) + : source_type = !amdaie.logicalobjectfifo<memref<1024xi32>> + amdaie.end + } + ``` + }]; + + let arguments = (ins Variadic:$tiles); + + let results = (outs AnyAMDAIELogicalObjectFifoType:$output); + + let assemblyFormat = [{ `{` $tiles `}` attr-dict `:` type($output)}]; +} + def AMDAIE_LogicalObjectFifoRelease: AMDAIE_Op<"logicalobjectfifo.release", []> { let summary = "Semaphore operation to release objects from a logical" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt index 11d57f637..9e07f4f03 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt @@ -14,6 +14,7 @@ iree_cc_library( HDRS "AMDAIEAttrs.h" "AMDAIEDialect.h" + "AMDAIELogicalObjFifoOpInterface.h" "AMDAIEOps.h" "AMDAIETypes.h" TEXTUAL_HDRS @@ -29,15 +30,19 @@ iree_cc_library( "AMDAIEOps.h.inc" "AMDAIEDmaOpInterface.cpp.inc" "AMDAIEDmaOpInterface.h.inc" + "AMDAIELogicalObjFifoOpInterface.h.inc" + "AMDAIELogicalObjFifoOpInterface.cpp.inc" SRCS "AMDAIEAttrs.cpp" "AMDAIEDmaOpInterface.cpp" "AMDAIEDialect.cpp" + "AMDAIELogicalObjFifoOpInterface.cpp" "AMDAIEOps.cpp" 
"AMDAIETypes.cpp" DEPS ::AMDAIEDialectGen ::AMDAIEDmaOpInterfaceGen + ::AMDAIELogicalObjFifoOpInterface ::AMDAIEOpsGen ::AMDAIETypesGen ::AMDAIEAttrsGen @@ -104,3 +109,13 @@ iree_tablegen_library( --gen-op-interface-decls AMDAIEDmaOpInterface.h.inc --gen-op-interface-defs AMDAIEDmaOpInterface.cpp.inc ) + +iree_tablegen_library( + NAME + AMDAIELogicalObjFifoOpInterface + TD_FILE + "AMDAIELogicalObjFifoOpInterface.td" + OUTS + --gen-op-interface-decls AMDAIELogicalObjFifoOpInterface.h.inc + --gen-op-interface-defs AMDAIELogicalObjFifoOpInterface.cpp.inc +) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index d5d85ede0..e740db7c3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -163,6 +163,22 @@ func.func @logicalobjectfifo_link(%arg0: !amdaie.logicalobjectfifo> +// CHECK: %{{.+}} = amdaie.logicalobjectfifo.placeholder{%[[tile_0_0]]} : !amdaie.logicalobjectfifo> +func.func @logicalobjectfifo_placeholder() { + %c0 = arith.constant 0 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + %placeholder_0 = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> + %placeholder_1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + return +} + + // ----- // CHECK-LABEL: func.func @logicalobjectfifo_release @@ -265,6 +281,45 @@ func.func @npu_dma_cpy_nd_mixed(%arg0: !amdaie.logicalobjectfifo>, %[[ARG1:.+]]: !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd %[[DMA0]](%[[ARG0]][] [] [], %[[ARG1]][] [] []) : target_type = !amdaie.logicalobjectfifo> source_type = !amdaie.logicalobjectfifo> +func.func @npu_dma_cpy_nd_target_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) 
: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.npu.dma_cpy_nd %0(%arg0[] [] [], %arg1[] [] []) : target_type = !amdaie.logicalobjectfifo> source_type = !amdaie.logicalobjectfifo> + return +} + +// ----- + +// CHECK-LABEL: func.func @npu_dma_cpy_nd_all_operands +// CHECK-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo>, %[[ARG1:.+]]: !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd %[[DMA0]] +// CHECK-SAME: %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [1, 1, %[[C8]], %[[C16]]] [%[[C128]], %[[C128]], %[[C16]], 1] bd_id = %[[BD_ID_0_0]] +// CHECK-SAME: %[[ARG1]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [1, 1, %[[C8]], %[[C16]]] [%[[C128]], %[[C16]], %[[C16]], 1] bd_id = %[[BD_ID_0_0]] +// CHECK-SAME: : target_type = !amdaie.logicalobjectfifo> source_type = !amdaie.logicalobjectfifo> +func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c128 = arith.constant 128 : index + %tile = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile, 0) + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.npu.dma_cpy_nd %0(%arg0[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c128, %c16, 1] bd_id = %bd_id, %arg1[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c16, %c16, 1] bd_id = %bd_id) : target_type = !amdaie.logicalobjectfifo> source_type = !amdaie.logicalobjectfifo> + return +} + +// ----- + // 
CHECK-LABEL: func.func @workgroup // CHECK: amdaie.workgroup // CHECK: amdaie.core diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp index 3e683d50d..7ef24032d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp @@ -64,9 +64,14 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); WalkResult res = controlCodeOp->walk([&](Operation *op) { if (auto npuDmaOp = dyn_cast(op)) { - AMDAIE::CircularDmaCpyNdOp inputDma = npuDmaOp.getDmaCpyNdOp(); - if (npuDmaOp.getSourceMemorySpaceAsUInt() == 0) { - SmallVector tiles = inputDma.getSourceObjectFifo().getTiles(); + if (npuDmaOp.getSource()) { + auto logicalObjFifo = dyn_cast( + npuDmaOp.getSource().getDefiningOp()); + if (!logicalObjFifo) { + npuDmaOp.emitOpError() << "expected a source logical objectFifo"; + return WalkResult::interrupt(); + } + SmallVector tiles = logicalObjFifo.getTiles(); AMDAIE::TileOp tileOp; if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp))) return WalkResult::interrupt(); @@ -83,13 +88,22 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { tileOp, bdId.value()); rewriter.setInsertionPoint(npuDmaOp); npuDmaOp = rewriter.replaceOpWithNewOp( - npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTargetMixedOffsets(), - npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(), - npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(), - npuDmaOp.getSourceMixedStrides(), npuDmaOp.getTargetBdId(), bdIdOp); + npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTarget(), + npuDmaOp.getTargetMixedOffsets(), npuDmaOp.getTargetMixedSizes(), + npuDmaOp.getTargetMixedStrides(), npuDmaOp.getTargetBdId(), + npuDmaOp.getSource(), 
npuDmaOp.getSourceMixedOffsets(), + npuDmaOp.getSourceMixedSizes(), npuDmaOp.getSourceMixedStrides(), + bdIdOp); } - if (npuDmaOp.getTargetMemorySpaceAsUInt() == 0) { - SmallVector tiles = inputDma.getTargetObjectFifo().getTiles(); + if (npuDmaOp.getTarget()) { + auto logicalObjFifo = dyn_cast( + npuDmaOp.getTarget().getDefiningOp()); + if (!logicalObjFifo) { + npuDmaOp.emitOpError() + << "expected a target `amdaie.logicalobjectfifo.from_memref`"; + return WalkResult::interrupt(); + } + SmallVector tiles = logicalObjFifo.getTiles(); AMDAIE::TileOp tileOp; if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp))) return WalkResult::interrupt(); @@ -106,10 +120,11 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { tileOp, bdId.value()); rewriter.setInsertionPoint(npuDmaOp); (void)rewriter.replaceOpWithNewOp( - npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTargetMixedOffsets(), - npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(), + npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTarget(), + npuDmaOp.getTargetMixedOffsets(), npuDmaOp.getTargetMixedSizes(), + npuDmaOp.getTargetMixedStrides(), bdIdOp, npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(), - npuDmaOp.getSourceMixedStrides(), bdIdOp, npuDmaOp.getSourceBdId()); + npuDmaOp.getSourceMixedStrides(), npuDmaOp.getSourceBdId()); } return WalkResult::advance(); } else if (auto npuWaitOp = dyn_cast(op)) { @@ -154,7 +169,7 @@ class AMDAIEAssignNpuDmaBdIdsPass } AMDAIEAssignNpuDmaBdIdsPass() = default; - AMDAIEAssignNpuDmaBdIdsPass(const AMDAIEAssignNpuDmaBdIdsPass &pass){}; + AMDAIEAssignNpuDmaBdIdsPass(const AMDAIEAssignNpuDmaBdIdsPass &pass) {}; void runOnOperation() override; }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp index c446df605..6ed1e4777 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp @@ -114,14 +114,28 @@ LogicalResult WorkgroupBuilder::buildForDmaCpyNdOp( SmallVector npuDmaSourceOffsets = dmaOp.getSourceMixedOffsets(); SmallVector npuDmaSourceSizes = dmaOp.getSourceMixedSizes(); SmallVector npuDmaSourceStrides = dmaOp.getSourceMixedStrides(); + Value circularDmaTarget, circularDmaSource, npuDmaTarget, npuDmaSource; if (!sourceMemSpace) { // Check if the source of DmaCpyNd op is from L3 - then source addressing // will be controlled by the uController and target addressing will stay in // the circular DMA to be part of the AIE configuration. + auto logicalObjFifo = dyn_cast( + dmaOp.getSource().getDefiningOp()); + if (!logicalObjFifo) { + return dmaOp.emitOpError() + << "`amdaie.logicalobjectfifo.from_memref` expected as source"; + } + auto type = cast(dmaOp.getSource().getType()); + auto placeholder = + rewriter.createAndLookup( + rewriter.getUnknownLoc(), type, logicalObjFifo.getTiles()); + circularDmaSource = placeholder.getResult(); + circularDmaTarget = dmaOp.getTarget(); circularDmaTargetOffsets = npuDmaTargetOffsets; circularDmaTargetSizes = npuDmaTargetSizes; circularDmaTargetStrides = npuDmaTargetStrides; + npuDmaSource = dmaOp.getSource(); npuDmaTargetOffsets = empty; npuDmaTargetSizes = empty; npuDmaTargetStrides = empty; @@ -129,26 +143,40 @@ LogicalResult WorkgroupBuilder::buildForDmaCpyNdOp( // Check if the target of DmaCpyNd op is from L3 - then target addressing // will be controlled by the uController and source addressing will stay in // the circular DMA to be part of the AIE configuration. 
+ auto logicalObjFifo = dyn_cast( + dmaOp.getTarget().getDefiningOp()); + if (!logicalObjFifo) { + return dmaOp.emitOpError() + << "`amdaie.logicalobjectfifo.from_memref` expected as target"; + } + auto type = cast(dmaOp.getTarget().getType()); + auto placeholder = + rewriter.createAndLookup( + rewriter.getUnknownLoc(), type, logicalObjFifo.getTiles()); + circularDmaSource = dmaOp.getSource(); + circularDmaTarget = placeholder.getResult(); circularDmaSourceOffsets = npuDmaSourceOffsets; circularDmaSourceSizes = npuDmaSourceSizes; circularDmaSourceStrides = npuDmaSourceStrides; + npuDmaTarget = dmaOp.getTarget(); npuDmaSourceOffsets = empty; npuDmaSourceSizes = empty; npuDmaSourceStrides = empty; } auto newDmaOp = rewriter.createAndMap( - rewriter.getUnknownLoc(), dmaOp, dmaOp.getTarget(), + rewriter.getUnknownLoc(), dmaOp, circularDmaTarget, circularDmaTargetOffsets, circularDmaTargetSizes, - circularDmaTargetStrides, dmaOp.getSource(), circularDmaSourceOffsets, + circularDmaTargetStrides, circularDmaSource, circularDmaSourceOffsets, circularDmaSourceSizes, circularDmaSourceStrides); IRRewriter::InsertPoint dmaInsertionPoint = rewriter.saveInsertionPoint(); controlCodeRewriter.setInsertionPoint(controlCode, controlCodeEnd); auto npuDmaCpy = controlCodeRewriter.createAndLookup( - loc, newDmaOp.getResult(), npuDmaTargetOffsets, npuDmaTargetSizes, - npuDmaTargetStrides, npuDmaSourceOffsets, npuDmaSourceSizes, - npuDmaSourceStrides, nullptr, nullptr); + loc, newDmaOp.getResult(), npuDmaTarget, npuDmaTargetOffsets, + npuDmaTargetSizes, npuDmaTargetStrides, /*target_bd_id=*/nullptr, + npuDmaSource, npuDmaSourceOffsets, npuDmaSourceSizes, npuDmaSourceStrides, + /*source_bd_id=*/nullptr); DMAChannelDir direction = !sourceMemSpace ? 
DMAChannelDir::MM2S : DMAChannelDir::S2MM; controlCodeRewriter.createAndLookup( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateLogicalObjectFifoLink.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateLogicalObjectFifoLink.cpp index f9225e612..cfd347313 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateLogicalObjectFifoLink.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateLogicalObjectFifoLink.cpp @@ -88,15 +88,15 @@ LogicalResult createLogicalObjectFifoLink( auto sourceLogicalObjectFifo = dyn_cast( stridedOp.getSource().getDefiningOp()); - if (!sourceLogicalObjectFifo) { - stridedOp->emitError( - "does not have a `LogicalObjectFifoFromMemrefOp` as source"); - return failure(); - } if (!lastUserOp || lastUserOp->isBeforeInBlock(stridedOp)) { lastUserOp = stridedOp; } - if (logicalObjectFifo == sourceLogicalObjectFifo) { + // The `sourceLogicalObjectFifo` could be either a + // `LogicalObjectFifoFromMemrefOp` or `LogicalObjectFifoPlaceholderOp`, + // but currently the linking only works with + // `LogicalObjectFifoFromMemrefOp` on L2. 
+ if (sourceLogicalObjectFifo && + logicalObjectFifo == sourceLogicalObjectFifo) { if (std::optional offset = stridedOp.getSourceStaticBaseOffset()) { outs.push_back(std::make_pair(stridedOp, offset.value())); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 6fccc017d..284b297c9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -460,8 +460,13 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter, int &dmaId) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::CircularDmaCpyNdOp]\n"); rewriter.setInsertionPointToEnd(deviceBlock); + if (!dmaOp.getSource()) return dmaOp.emitOpError() << "expected a source"; + auto sourceLogicalObjFifo = dyn_cast( + dmaOp.getSource().getDefiningOp()); + if (!sourceLogicalObjFifo) + return dmaOp.emitOpError() << "expected a logical objectFifo source"; SmallVector newSourceTiles = - llvm::map_to_vector(dmaOp.getSourceObjectFifo().getTiles(), + llvm::map_to_vector(sourceLogicalObjFifo.getTiles(), [&](Value tile) { return mapper.lookup(tile); }); if (newSourceTiles.size() != 1) { return dmaOp.emitError() @@ -469,8 +474,14 @@ LogicalResult circularDmaToAIE(IRRewriter &rewriter, "`ObjectFifoCreateOp` only handles a single source tile for now."; } Value newSourceTile = newSourceTiles[0]; + + if (!dmaOp.getTarget()) return dmaOp.emitOpError() << "expected a target"; + auto targetLogicalObjFifo = dyn_cast( + dmaOp.getTarget().getDefiningOp()); + if (!targetLogicalObjFifo) + return dmaOp.emitOpError() << "expected a logical objectFifo target"; SmallVector newTargetTiles = - llvm::map_to_vector(dmaOp.getTargetObjectFifo().getTiles(), + llvm::map_to_vector(targetLogicalObjFifo.getTiles(), [&](Value tile) { return mapper.lookup(tile); }); auto symName = "obj" + std::to_string(dmaId++); @@ -563,7 +574,13 
@@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, IRMapping &mapper, IRMapping &bindingsMapper) { rewriter.setInsertionPoint(dmaOp); // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves. - if (dmaOp.getSourceMemorySpaceAsUInt() == 0) { + if (dmaOp.getSource()) { + auto sourceLogicalObjFifo = dyn_cast( + dmaOp.getSource().getDefiningOp()); + if (!sourceLogicalObjFifo) { + return dmaOp.emitOpError() << "expected source to be an " + "`amdaie.logicalobjectfifo.from_memref`"; + } if (!dmaOp.hasSourceAddressing()) { return dmaOp.emitOpError() << "expected source addressing for DMA with source on L3"; @@ -590,8 +607,7 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, } AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp(); - Value memref = - bindingsMapper.lookup(dmaCpyNd.getSourceObjectFifo().getMemref()); + Value memref = bindingsMapper.lookup(sourceLogicalObjFifo.getMemref()); auto objFifo = dyn_cast( mapper.lookup(dmaCpyNd.getOperation())); if (!objFifo) { @@ -604,7 +620,13 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, empty, empty, staticOffsets, staticSizes, staticStrides, objFifo.getName(), bdIdOp.getValue(), issueToken); } - if (dmaOp.getTargetMemorySpaceAsUInt() == 0) { + if (dmaOp.getTarget()) { + auto targetLogicalObjFifo = dyn_cast( + dmaOp.getTarget().getDefiningOp()); + if (!targetLogicalObjFifo) { + return dmaOp.emitOpError() << "expected target to be an " + "`amdaie.logicalobjectfifo.from_memref`"; + } if (!dmaOp.hasTargetAddressing()) { return dmaOp.emitOpError() << "expected target addressing for DMA with target on L3"; @@ -631,8 +653,7 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, } AMDAIE::CircularDmaCpyNdOp dmaCpyNd = dmaOp.getDmaCpyNdOp(); - Value memref = - bindingsMapper.lookup(dmaCpyNd.getTargetObjectFifo().getMemref()); + Value memref = bindingsMapper.lookup(targetLogicalObjFifo.getMemref()); auto objFifo = dyn_cast( mapper.lookup(dmaCpyNd.getOperation())); if (!objFifo) { @@ -786,7 
+807,7 @@ LogicalResult tileToAIE(IRRewriter &rewriter, AMDAIE::TileOp tileOp, LogicalResult workgroupToAIE(IRRewriter &rewriter, AMDAIE::WorkgroupOp workgroupOp, xilinx::AIE::DeviceOp deviceOp, - xilinx::AIEX::RuntimeSequenceOp ipuFuncOp, + xilinx::AIEX::RuntimeSequenceOp npuFuncOp, IRMapping &mapper, IRMapping &bindingsMapper) { OpBuilder::InsertionGuard guard(rewriter); Block *deviceBlock = &deviceOp.getRegion().front(); @@ -810,7 +831,7 @@ LogicalResult workgroupToAIE(IRRewriter &rewriter, return WalkResult::advance(); }) .Case([&](auto controlCodeOp) { - if (failed(controlCodeToAie(rewriter, controlCodeOp, ipuFuncOp, + if (failed(controlCodeToAie(rewriter, controlCodeOp, npuFuncOp, mapper, bindingsMapper))) { controlCodeOp.emitError("could not convert to AIEDialect ops"); return WalkResult::interrupt(); @@ -897,13 +918,13 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { return a.getBinding().getZExtValue() < b.getBinding().getZExtValue(); }); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); - auto ipuFuncOp = rewriter.create( + auto npuFuncOp = rewriter.create( rewriter.getUnknownLoc(), rewriter.getStringAttr(funcOp.getSymName())); - ipuFuncOp.getBody().push_back(new Block); + npuFuncOp.getBody().push_back(new Block); for (int i = 0, e = subspanOps.size(); i < e; i++) { auto a = subspanOps[i].getResult(); - ipuFuncOp.getBody().addArgument(a.getType(), a.getLoc()); - bindingsMapper.map(a, ipuFuncOp.getBody().getArgument(i)); + npuFuncOp.getBody().addArgument(a.getType(), a.getLoc()); + bindingsMapper.map(a, npuFuncOp.getBody().getArgument(i)); } // Walk the AIE regions ops and convert ops into pure AIEDialect ops. 
@@ -913,7 +934,7 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { if (isa(op)) { return WalkResult::advance(); } else if (auto workgroupOp = dyn_cast(op)) { - if (failed(workgroupToAIE(rewriter, workgroupOp, deviceOp, ipuFuncOp, + if (failed(workgroupToAIE(rewriter, workgroupOp, deviceOp, npuFuncOp, mapper, bindingsMapper))) { return WalkResult::interrupt(); } @@ -928,7 +949,7 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { if (res.wasInterrupted()) return WalkResult::interrupt(); // Move NPU instruction function to the end of the device block. - rewriter.moveOpBefore(ipuFuncOp, deviceBlock, deviceBlock->end()); + rewriter.moveOpBefore(npuFuncOp, deviceBlock, deviceBlock->end()); // After walking the FuncOp, it has been converted into a DeviceOp and can // safely be erased. eraseOp(rewriter, mapper, funcOp); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir index 8cb4ba4e7..8d0eba8fe 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir @@ -14,11 +14,12 @@ module { // CHECK-LABEL: @single_dma_cpy_nd_on_source // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup -// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// 
CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -29,11 +30,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1, MM2S) amdaie.end } @@ -47,11 +49,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @single_dma_cpy_nd_on_target // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup -// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF:.+]] = 
amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]](%[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -62,11 +65,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_0[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %0 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) + %1 = amdaie.npu.dma_cpy_nd %0(%from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1, S2MM) amdaie.end } @@ -82,19 +86,22 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[C1:.+]] = arith.constant 1 : index // CHECK: %[[C2:.+]] = arith.constant 2 : index // CHECK: 
amdaie.workgroup -// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) -// CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0) -// CHECK: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) -// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0) +// CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd 
%[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) @@ -110,17 +117,20 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_1_0 = amdaie.tile(%c1, %c0) %tile_2_0 = amdaie.tile(%c2, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], 
%placeholder1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) - %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0] [8, 16] [16, 1]) - %2 = amdaie.npu.dma_cpy_nd %dma2([] [] [], [0] [128] [1]) + %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], %from_memref_1[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd %dma2([] [] [], %from_memref_2[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%0, MM2S) amdaie.npu.dma_wait(%1, MM2S) amdaie.npu.dma_wait(%2, MM2S) @@ -136,15 +146,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @multiple_dma_cpy_with_bd_id_reuse // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup -// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) -// CHECK: 
%[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -155,15 +166,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1, MM2S) - %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0] [8, 16] [16, 1]) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0] [8, 16] [16, 
1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%2, MM2S) - %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0] [128] [1]) + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%3, MM2S) amdaie.end } @@ -181,11 +193,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) // CHECK-DAG: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], 1) // CHECK-DAG: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], 2) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) @@ -198,13 +211,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder0 = 
amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) - %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0] [8, 16] [16, 1]) - %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0] [128] [1]) + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0, 0] [8, 16] [16, 1]) : source_type = !amdaie.logicalobjectfifo> + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%1, MM2S) amdaie.npu.dma_wait(%2, MM2S) amdaie.npu.dma_wait(%3, MM2S) @@ -231,19 +245,22 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 1) // CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) // CHECK-DAG: %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : 
memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]]) // CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]]) // CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]]) -// CHECK: %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_0_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]]) +// CHECK: %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0_1]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_3]], MM2S) -// CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], [] [] [] bd_id = %[[BD_ID_2_0]]) +// CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_4]], MM2S) // CHECK: } // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) @@ -262,23 +279,26 @@ module attributes 
{hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_1_0 = amdaie.tile(%c1, %c0) %tile_2_0 = amdaie.tile(%c2, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2_0} : !amdaie.logicalobjectfifo> %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) + %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], %from_memref_0[0, 0, 0, 0] [1, 1, 8, 
16] [128, 128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> scf.forall (%arg4, %arg5) in (2, 2) { - %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo> scf.for %arg6 = %c0 to %c6 step %c1 { - %2 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0] [1, 128] [128, 1]) - %3 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0] [128] [1]) + %2 = amdaie.npu.dma_cpy_nd %dma1([] [] [], %from_memref_1[0, 0] [1, 128] [128, 1]) : source_type = !amdaie.logicalobjectfifo> + %3 = amdaie.npu.dma_cpy_nd %dma0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%2, MM2S) amdaie.npu.dma_wait(%3, MM2S) - %4 = amdaie.npu.dma_cpy_nd %dma2([] [] [], [] [] []) + %4 = amdaie.npu.dma_cpy_nd %dma2([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%4, MM2S) } amdaie.npu.dma_wait(%1, MM2S) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir index 94c882d51..5d5ff3997 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/create_aie_workgroup.mlir @@ -60,19 +60,19 @@ func.func @core() { // ----- // CHECK-LABEL: @dma_cpy_nd_L3_L2 +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32, 1>, %[[ARG1:.+]]: memref<8x16xi32> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] +// 
CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][] [] [] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode +// CHECK: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]], {} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] // CHECK-SAME: [] [] [] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) func.func @dma_cpy_nd_L3_L2(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi32>) { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> @@ -84,19 +84,19 @@ func.func @dma_cpy_nd_L3_L2(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi // ----- // CHECK-LABEL: @dma_cpy_nd_L3_L2_target_addressing +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32, 1>, %[[ARG1:.+]]: memref<8x16xi32> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK-SAME: %[[FROMMEMREF0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] -// CHECK-SAME: %[[FROMMEMREF1]][] [] [] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // 
CHECK: amdaie.controlcode +// CHECK: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]], {} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] // CHECK-SAME: [] [] [] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) func.func @dma_cpy_nd_L3_L2_target_addressing(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi32>) { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> @@ -108,19 +108,17 @@ func.func @dma_cpy_nd_L3_L2_target_addressing(%arg0: memref<1x1x8x16xi32, 1>, %a // ----- // CHECK-LABEL: @dma_cpy_nd_L2_L3 +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 1> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] +// CHECK-SAME: %[[FROMMEMREF0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] +// CHECK: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]], {} : memref<1x1x8x16xi32> -> 
!amdaie.logicalobjectfifo> +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]](%[[FROMMEMREF1]][] [] [], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) func.func @dma_cpy_nd_L2_L3(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> @@ -132,18 +130,18 @@ func.func @dma_cpy_nd_L2_L3(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, // ----- // CHECK-LABEL: @dma_cpy_nd_L2_L3_target_addressing +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 1> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] +// CHECK-SAME: %[[FROMMEMREF0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode +// CHECK: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]], {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] // CHECK-SAME: [] [] [] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) func.func @dma_cpy_nd_L2_L3_target_addressing(%arg0: memref<1x1x8x16xi32>, %arg1: 
memref<8x16xi32, 1>) { @@ -226,26 +224,26 @@ func.func @for_cores() { // Verify that scf.for is inserted in control code with nested dmas. // // CHECK-LABEL: @for_dma +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 1> // CHECK: amdaie.workgroup // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][] [] [] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode // CHECK-DAG: %[[C0_1:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1_1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8_1:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] // CHECK: scf.for %[[ARG:.+]] = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] // CHECK-SAME: [] [] [] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, 16, %[[ARG]], 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, %[[ARG]], 1] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) func.func @for_dma(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { %c0 = arith.constant 0 : index @@ -308,20 +306,20 @@ func.func @forall_cores() { // Verify that scf.forall is inserted in control code with 
nested dmas. // // CHECK-LABEL: @forall_dmas +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 1> // CHECK: amdaie.workgroup -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] // CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][] [] [] +// CHECK-SAME: %[[PLACEHOLDER]][] [] [] // CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: amdaie.controlcode +// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (2, 2) // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] // CHECK-SAME: [] [] [] -// CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 16] [128, %[[ARG1]], %[[ARG0]], 1] +// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, %[[ARG1]], %[[ARG0]], 1] // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) func.func @forall_dmas(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> @@ -337,57 +335,62 @@ func.func @forall_dmas(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) // Verify that cores on the same location, but within different scope merge correctly. 
// // CHECK-LABEL: @merge_cores +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 2> // CHECK: amdaie.workgroup // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index // CHECK-DAG: %[[TILE_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] -// CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : []) +// CHECK-DAG: %[[PLACEHOLDER:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] +// CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF0]][] [] [], %[[PLACEHOLDER]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: %[[PLACEHOLDER2:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK: %[[DMA2:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF0]][] [] [], %[[PLACEHOLDER2]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: %{{.+}} = amdaie.core(%[[TILE_0]], in : [%[[DMA]], %[[DMA2]]], out : []) // CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) +// CHECK: 
%{{.+}} = amdaie.core(%[[TILE_1]], in : [%[[DMA]], %[[DMA2]]], out : []) // CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) // CHECK: amdaie.controlcode // CHECK-DAG: %[[C0_1:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1_1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8_1:.+]] = arith.constant 8 : index -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) -// CHECK: scf.for %{{.*}} = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] -func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) { +// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[DMA]]([] [] [], %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) +// CHECK: scf.for %{{.*}} = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] { +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[DMA2]]([] [] [], %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) +// CHECK: } +func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 2>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> - %2 = amdaie.dma_cpy_nd(%0[] [] [], %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core_0_0_0 = amdaie.core(%tile_0_0, in : [], out : []) { - amdaie.logicalobjectfifo.access(%0, 
Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> + %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core_0_0_0 = amdaie.core(%tile_0_0, in : [%2], out : []) { + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } - %core_0_1_0 = amdaie.core(%tile_0_1, in : [], out : []) { - amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> + %core_0_1_0 = amdaie.core(%tile_0_1, in : [%2], out : []) { + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } scf.for %arg2 = %c0 to %c8 step %c1 { - %core_0_0_1 = amdaie.core(%tile_0_0, in : [], out : []) { + %3 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core_0_0_1 = amdaie.core(%tile_0_0, in : [%3], out : []) { + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } - %core_0_1_1 = amdaie.core(%tile_0_1, in : [], out : []) { + %core_0_1_1 = amdaie.core(%tile_0_1, in : [%3], out : []) { + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } } @@ -397,64 +400,47 @@ func.func @merge_cores(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>) // ----- // CHECK-LABEL: @complex_example +// CHECK-SAME: %[[ARG0:.+]]: memref<1x1x8x16xi32>, %[[ARG1:.+]]: memref<8x16xi32, 2>, %[[ARG2:.+]]: memref<1x1x16x16xi32>, %[[ARG3:.+]]: memref<16x16xi32, 2>, %[[ARG4:.+]]: memref<1x1x32x16xi32>, %[[ARG5:.+]]: memref<32x16xi32, 2> // CHECK: amdaie.workgroup // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.+]] = 
arith.constant 8 : index // CHECK-DAG: %[[TILE_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF2:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x16x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF3:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<16x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF4:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<1x1x32x16xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[FROMMEMREF5:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK-SAME: memref<32x16xi32, 1> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF0]][] [] [] -// CHECK-SAME: %[[FROMMEMREF1]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1] -// CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %[[DMA1:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF2]][] [] [] -// CHECK-SAME: %[[FROMMEMREF3]][0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1] -// CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %[[DMA2:.+]] = amdaie.circular_dma_cpy_nd -// CHECK-SAME: %[[FROMMEMREF4]][] [] [] -// CHECK-SAME: %[[FROMMEMREF5]][0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1] -// CHECK-SAME: (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : []) -// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) +// CHECK-DAG: %[[FROMMEMREF1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG1]] +// CHECK-DAG: %[[FROMMEMREF3:.+]] = 
amdaie.logicalobjectfifo.from_memref %[[ARG3]] +// CHECK-DAG: %[[FROMMEMREF5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG5]] +// CHECK-DAG: %[[PLACEHOLDER0:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF1]][] [] [], %[[PLACEHOLDER0]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: %[[PLACEHOLDER1:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[DMA1:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF3]][] [] [], %[[PLACEHOLDER1]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: %[[PLACEHOLDER2:.+]] = amdaie.logicalobjectfifo.placeholder{} : !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[DMA2:.+]] = amdaie.circular_dma_cpy_nd(%[[FROMMEMREF5]][] [] [], %[[PLACEHOLDER2]][] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: %{{.+}} = amdaie.core(%[[TILE_0]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF1]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF2]], Read) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF3]], Read) // CHECK: linalg.fill -// CHECK-DAG: %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : []) -// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF0]], Read) +// CHECK: %{{.+}} = amdaie.core(%[[TILE_1]], in : [], out : []) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF1]], Read) // CHECK: scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]] -// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF4]], Read) +// CHECK: amdaie.logicalobjectfifo.access(%[[FROMMEMREF5]], Read) // CHECK: linalg.fill // CHECK: amdaie.controlcode +// CHECK-DAG: %[[FROMMEMREF0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG0]] +// CHECK-DAG: %[[FROMMEMREF2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG2]] +// CHECK-DAG: 
%[[FROMMEMREF4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ARG4]] // CHECK-DAG: %[[C0_1:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1_1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8_1:.+]] = arith.constant 8 : index -// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[DMA0]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], S2MM) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[DMA0]]([] [] [], %[[FROMMEMREF0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) // CHECK: scf.for %{{.*}} = %[[C0_1]] to %[[C8_1]] step %[[C1_1]] -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[DMA1]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], S2MM) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[DMA2]] -// CHECK-SAME: [] [] [] -// CHECK-SAME: [] [] [] -// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], S2MM) -func.func @complex_example(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 1>, %arg2: memref<1x1x16x16xi32>, %arg3: memref<16x16xi32, 1>, %arg4: memref<1x1x32x16xi32>, %arg5: memref<32x16xi32, 1>) { +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[DMA1]]([] [] [], %[[FROMMEMREF2]][0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[DMA2]]([] [] [], %[[FROMMEMREF4]][0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) +func.func @complex_example(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, 2>, %arg2: memref<1x1x16x16xi32>, %arg3: memref<16x16xi32, 2>, %arg4: memref<1x1x32x16xi32>, %arg5: memref<32x16xi32, 2>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index @@ -462,31 +448,31 @@ func.func @complex_example(%arg0: memref<1x1x8x16xi32>, %arg1: memref<8x16xi32, %tile_0_0 = amdaie.tile(%c0, %c0) 
%tile_0_1 = amdaie.tile(%c0, %c1) %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<1x1x8x16xi32> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %arg1, {} : memref<8x16xi32, 2> -> !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_memref %arg2, {} : memref<1x1x16x16xi32> -> !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_memref %arg3, {} : memref<16x16xi32, 1> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %arg3, {} : memref<16x16xi32, 2> -> !amdaie.logicalobjectfifo> %4 = amdaie.logicalobjectfifo.from_memref %arg4, {} : memref<1x1x32x16xi32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.logicalobjectfifo.from_memref %arg5, {} : memref<32x16xi32, 1> -> !amdaie.logicalobjectfifo> - %dma_0 = amdaie.dma_cpy_nd(%0[] [] [], %1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %arg5, {} : memref<32x16xi32, 2> -> !amdaie.logicalobjectfifo> + %dma_0 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %core_0_0_0 = amdaie.core(%tile_0_0, in : [], out : []) { - amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } %core_0_1_0 = amdaie.core(%tile_0_1, in : [], out : []) { - amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32> + amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> amdaie.end } scf.for %iv0 = %c0 to %c8 step %c1 { - %dma_1 = amdaie.dma_cpy_nd(%2[] [] [], %3[0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - 
%dma_2 = amdaie.dma_cpy_nd(%4[] [] [], %5[0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_1 = amdaie.dma_cpy_nd(%3[] [] [], %2[0, 0, 0, 0] [1, 1, 16, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_2 = amdaie.dma_cpy_nd(%5[] [] [], %4[0, 0, 0, 0] [1, 1, 32, 16] [128, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %core_0_0_1 = amdaie.core(%tile_0_0, in : [], out : []) { - amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x16x16xi32> - linalg.fill ins(%c0_i32 : i32) outs(%arg2 : memref<1x1x16x16xi32>) + amdaie.logicalobjectfifo.access(%3, Read) : !amdaie.logicalobjectfifo> -> memref<16x16xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%arg3 : memref<16x16xi32, 2>) amdaie.end } %core_0_1_1 = amdaie.core(%tile_0_1, in : [], out : []) { - amdaie.logicalobjectfifo.access(%4, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x32x16xi32> - linalg.fill ins(%c0_i32 : i32) outs(%arg4 : memref<1x1x32x16xi32>) + amdaie.logicalobjectfifo.access(%5, Read) : !amdaie.logicalobjectfifo> -> memref<32x16xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%arg5 : memref<32x16xi32, 2>) amdaie.end } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index 6766f04a4..9f5b2f6aa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -420,18 +420,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_2 = amdaie.tile(%c0, %c2) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = 
amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{op expected to have a target BD ID op}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_0, S2MM) amdaie.end } @@ -464,18 +465,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> 
!amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{op expected target addressing for DMA with target on L3}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([] [] [] bd_id = %bd_id_0, [] [] []) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[] [] [] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_0, S2MM) amdaie.end } @@ -506,18 +508,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], 
%obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{could not canonicalize for AIE}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([0, 0, 0, 32] [1, 32, 2, 32] [0, 64, 0, 1] bd_id = %bd_id_0, [] [] []) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [1, 32, 2, 32] [0, 64, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -547,18 +550,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, 
!amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{could not canonicalize for AIE}} - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([0, 0, 0, 32] [2, 8, 2, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [2, 8, 2, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -591,16 +595,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) 
amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([0, 0, 32] [1, 2, 32] [0, 0, 1] bd_id = %bd_id_0, [] [] []) + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 32] [1, 2, 32] [0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -633,16 +638,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0, [] [] []) + %obj0 = 
amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.end } } @@ -707,24 +713,25 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_source_l3 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma1 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_source_l3 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma1 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : 
memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0, [] [] []) + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_0, S2MM) - %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0] [%c1024] [%c1] bd_id = %bd_id_0, [] [] []) + %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[%c0] [%c1024] [%c1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_1, S2MM) - %npu_dma_2 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], [%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) + %npu_dma_2 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], %obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_2, MM2S) - %npu_dma_3 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], [%c0] [%c2048] [%c1] bd_id = %bd_id_0) + %npu_dma_3 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], %obj0[%c0] [%c2048] [%c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_3, MM2S) amdaie.end } @@ -736,12 +743,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK: aie.device(npu1_4col) { -// CHECK: %[[TILE_0_0:.*]] = aie.tile(0, 0) -// CHECK: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK-DAG: %[[TILE_0_0:.*]] = aie.tile(0, 0) +// CHECK-DAG: %[[TILE_0_1:.*]] = aie.tile(0, 1) +// CHECK-DAG: %[[TILE_1_0:.*]] = aie.tile(1, 0) // CHECK: aie.objectfifo @[[OBJ0:.*]](%[[TILE_0_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> -// CHECK: aie.objectfifo @[[OBJ1:.*]](%[[TILE_0_0]], {%[[TILE_0_1]]}, 2 : i32) : 
!aie.objectfifo> -// CHECK: aie.objectfifo @[[OBJ2:.*]](%[[TILE_0_1]] -// CHECK-SAME: %[[TILE_0_0]]}, 2 : i32) : !aie.objectfifo> +// CHECK: aie.objectfifo @[[OBJ1:.*]](%[[TILE_1_0]], {%[[TILE_0_1]]}, 2 : i32) : !aie.objectfifo> +// CHECK: aie.objectfifo @[[OBJ2:.*]](%[[TILE_0_1]] +// CHECK-SAME: {%[[TILE_1_0]]}, 2 : i32) : !aie.objectfifo> // CHECK: aiex.runtime_sequence @bf16_f32_lit_test // CHECK-SAME: (%[[LHS:.*]]: memref<32x32xbf16>, %[[RHS:.*]]: memref<32x32xbf16>, %[[OUT:.*]]: memref<32x32xf32>) { // CHECK: aiex.npu.dma_memcpy_nd @@ -784,23 +792,27 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x16xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16> %tile_1 = amdaie.tile(%c0, %c0) + %tile_2 = amdaie.tile(%c1, %c0) %bd_id = amdaie.bd_id(%tile_1, 2) %bd_id_2 = amdaie.bd_id(%tile_1, 1) %bd_id_3 = amdaie.bd_id(%tile_1, 0) - %4 = amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_1} : !amdaie.logicalobjectfifo> memref.assume_alignment %3, 64 : memref<32x32xbf16> %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32xbf16> - %6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %placeholder1 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> memref.assume_alignment %5, 64 : memref<32x32xbf16> %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x32xf32> - %8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo> - %9 = 
amdaie.circular_dma_cpy_nd(%2[] [] [], %4[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %10 = amdaie.circular_dma_cpy_nd(%1[] [] [], %6[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %11 = amdaie.circular_dma_cpy_nd(%8[] [] [], %0[%c0, %c0, %c0, %c0] [%c2, %c16, %c2, %c16] [%c512, %c16, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + %placeholder2 = amdaie.logicalobjectfifo.placeholder{%tile_2} : !amdaie.logicalobjectfifo> + %9 = amdaie.circular_dma_cpy_nd(%2[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %10 = amdaie.circular_dma_cpy_nd(%1[] [] [], %placeholder1[] [] []) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %11 = amdaie.circular_dma_cpy_nd(%placeholder2[] [] [], %0[%c0, %c0, %c0, %c0] [%c2, %c16, %c2, %c16] [%c512, %c16, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) amdaie.controlcode { - %12 = amdaie.npu.dma_cpy_nd %11([%c0] [%c1024] [%c1] bd_id = %bd_id_3, [] [] []) - %13 = amdaie.npu.dma_cpy_nd %10([] [] [], [%c0, %c1, %c2] [%c2, %c32, %c16] [%c16, %c32, %c1] bd_id = %bd_id_2) - %14 = amdaie.npu.dma_cpy_nd %9([] [] [], [%c0] [%c1024] [%c1] bd_id = %bd_id) + %obj0 = amdaie.logicalobjectfifo.from_memref %3, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %obj1 = amdaie.logicalobjectfifo.from_memref %5, {%tile_1} : memref<32x32xbf16> -> !amdaie.logicalobjectfifo> + %obj2 = amdaie.logicalobjectfifo.from_memref %7, {%tile_1} : memref<32x32xf32> -> !amdaie.logicalobjectfifo> + %12 = amdaie.npu.dma_cpy_nd %11(%obj2[%c0] [%c1024] [%c1] bd_id = %bd_id_3, [] [] []) : target_type = !amdaie.logicalobjectfifo> + %13 = amdaie.npu.dma_cpy_nd %10([] [] [], %obj1[%c0, %c1, %c2] [%c2, %c32, %c16] [%c16, %c32, %c1] bd_id = %bd_id_2) : source_type = !amdaie.logicalobjectfifo> + %14 = amdaie.npu.dma_cpy_nd %9([] [] [], %obj0[%c0] [%c1024] [%c1] bd_id = %bd_id) : source_type = 
!amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%12, S2MM) amdaie.npu.dma_wait(%13, MM2S) amdaie.npu.dma_wait(%14, MM2S) @@ -837,18 +849,19 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x16x64x128x32xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%placeholder[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x16x64x128x32xi32> -> !amdaie.logicalobjectfifo> // expected-error @+1 {{op expected target addressing for DMA with target on L3}} - %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3([] [] [] bd_id = %bd_id_0, [] [] []) + %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3(%obj0[] [] [] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma_1, S2MM) amdaie.end } @@ -931,10 +944,10 @@ module attributes 
{hal.executable.target = #executable_target_amdaie_xclbin_fb} memref.assume_alignment %0, 64 : memref<32x64xi32> %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %placeholder = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %placeholder[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.logicalobjectfifo.link[%dma0] -> [%dma1] () %core_0_2 = amdaie.core(%tile_0_2, in : [%dma1], out : []) { @@ -960,7 +973,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma = amdaie.npu.dma_cpy_nd %dma0([] [] [], [%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) + %obj0 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %npu_dma = amdaie.npu.dma_cpy_nd %dma0([] [] [], %obj0[%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> amdaie.npu.dma_wait(%npu_dma, MM2S) amdaie.end }