diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignBufferDescriptorIDs.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignBufferDescriptorIDs.cpp index 16c559902..343fe9999 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignBufferDescriptorIDs.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEAssignBufferDescriptorIDs.cpp @@ -4,10 +4,12 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include #include "Passes.h" #include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/Pass/Pass.h" @@ -21,32 +23,99 @@ using namespace xilinx; using namespace xilinx::AIE; namespace mlir::iree_compiler::AMDAIE { -struct BdIdGenerator { - BdIdGenerator(int col, int row, AMDAIEDeviceModel &deviceModel) - : col(col), row(row), isMemTile(deviceModel.isMemTile(col, row)) {} - - int32_t nextBdId(int channelIndex) { - int32_t bdId = isMemTile && channelIndex & 1 ? oddBdId++ : evenBdId++; - while (bdIdAlreadyAssigned(bdId)) - bdId = isMemTile && channelIndex & 1 ? oddBdId++ : evenBdId++; - assignBdId(bdId); - return bdId; - } - void assignBdId(int32_t bdId) { - assert(!alreadyAssigned.count(bdId) && "bdId has already been assigned"); - alreadyAssigned.insert(bdId); - } +/// Assign BD ids to DMABDOp's in MemOps. 
+LogicalResult assignBdIds(DeviceOp deviceOp) { + AMDAIEDeviceModel deviceModel = mlir::iree_compiler::AMDAIE::getDeviceModel( + static_cast(deviceOp.getDevice())); + + ChannelBdIdGenerator shimChannelBdIdGenerator( + deviceModel.getChannelToValidBdIds(AMDAIETileType::SHIMNOC)); + ChannelBdIdGenerator memTileChannelBdIdGenerator( + deviceModel.getChannelToValidBdIds(AMDAIETileType::MEMTILE)); + + auto memOps = llvm::to_vector_of(deviceOp.getOps()); + llvm::append_range(memOps, deviceOp.getOps()); + llvm::append_range(memOps, deviceOp.getOps()); + for (TileElement memOp : memOps) { + int col = memOp.getTileID().col; + int row = memOp.getTileID().row; + + // BdIdGenerator gen(col, row, deviceModel); + ChannelBdIdGenerator gen = deviceModel.isMemTile(col, row) + ? memTileChannelBdIdGenerator + : shimChannelBdIdGenerator; + + memOp->walk([&](DMABDOp bd) { + if (bd.getBdId().has_value()) gen.assignBdId(bd.getBdId().value()); + }); + + DenseMap blockChannelMap; + // Associate with each block the channel index specified by the + // dma_start + for (Block &block : memOp.getOperation()->getRegion(0)) + for (auto op : block.getOps()) { + int chNum = op.getChannelIndex(); + blockChannelMap[&block] = chNum; + Block *dest = op.getDest(); + while (dest) { + blockChannelMap[dest] = chNum; + if (dest->hasNoSuccessors()) break; + dest = dest->getSuccessors()[0]; + if (blockChannelMap.contains(dest)) dest = nullptr; + } + } - bool bdIdAlreadyAssigned(int32_t bdId) { return alreadyAssigned.count(bdId); } + for (Block &block : memOp.getOperation()->getRegion(0)) { + if (block.getOps().empty()) continue; + assert(blockChannelMap.count(&block)); + DMABDOp bd = (*block.getOps().begin()); + if (bd.getBdId().has_value()) { + assert(gen.isBdIdAssigned(bd.getBdId().value()) && + "bdId assigned by user but not found during previous walk"); + } else { + std::optional bdId = + gen.getAndAssignBdId(blockChannelMap[&block]); + if (!bdId) + return memOp.emitOpError() + << "could not find and 
assign a valid BD id"; + bd.setBdId(bdId.value()); + } + } + } + for (TileElement memOp : memOps) { + DenseMap blockBdIdMap; + for (Block &block : memOp.getOperation()->getRegion(0)) { + if (block.getOps().empty()) continue; + DMABDOp bd = *block.getOps().begin(); + assert(bd.getBdId().has_value() && + "DMABDOp should have bd_id assigned by now"); + blockBdIdMap[&block] = bd.getBdId().value(); + } - int col; - int row; - int oddBdId = ODD_BD_ID_START; - int evenBdId = EVEN_BD_ID_START; - bool isMemTile; - std::set alreadyAssigned; -}; + for (Block &block : memOp.getOperation()->getRegion(0)) { + if (block.getOps().empty()) continue; + DMABDOp bd = *block.getOps().begin(); + std::optional nextBdId; + if (block.getNumSuccessors()) { + assert(llvm::range_size(block.getSuccessors()) == 1 && + "should have only one successor block"); + Block *nextBlock = block.getSuccessor(0); + if (!blockBdIdMap.contains(nextBlock)) + assert(nextBlock->getOperations().size() == 1 && + // for some reason i can't stick both of ops in a single + // isa<...> + (isa(nextBlock->getOperations().front()) || + isa(nextBlock->getOperations().front())) && + "bb that's not in blockMap can only have aie.end"); + else + nextBdId = blockBdIdMap[nextBlock]; + bd.setNextBdId(nextBdId); + } + } + } + return success(); +} struct AMDAIEAssignBufferDescriptorIDsPass : mlir::OperationPass { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( @@ -70,79 +139,7 @@ struct AMDAIEAssignBufferDescriptorIDsPass : mlir::OperationPass { void runOnOperation() override { DeviceOp deviceOp = getOperation(); - AMDAIEDeviceModel deviceModel = mlir::iree_compiler::AMDAIE::getDeviceModel( - static_cast(deviceOp.getDevice())); - - auto memOps = llvm::to_vector_of(deviceOp.getOps()); - llvm::append_range(memOps, deviceOp.getOps()); - llvm::append_range(memOps, deviceOp.getOps()); - for (TileElement memOp : memOps) { - int col = memOp.getTileID().col; - int row = memOp.getTileID().row; - - BdIdGenerator gen(col, row, deviceModel); - 
memOp->walk([&](DMABDOp bd) { - if (bd.getBdId().has_value()) gen.assignBdId(bd.getBdId().value()); - }); - - DenseMap blockChannelMap; - // Associate with each block the channel index specified by the - // dma_start - for (Block &block : memOp.getOperation()->getRegion(0)) - for (auto op : block.getOps()) { - int chNum = op.getChannelIndex(); - blockChannelMap[&block] = chNum; - Block *dest = op.getDest(); - while (dest) { - blockChannelMap[dest] = chNum; - if (dest->hasNoSuccessors()) break; - dest = dest->getSuccessors()[0]; - if (blockChannelMap.contains(dest)) dest = nullptr; - } - } - - for (Block &block : memOp.getOperation()->getRegion(0)) { - if (block.getOps().empty()) continue; - assert(blockChannelMap.count(&block)); - DMABDOp bd = (*block.getOps().begin()); - if (bd.getBdId().has_value()) - assert(gen.bdIdAlreadyAssigned(bd.getBdId().value()) && - "bdId assigned by user but not found during previous walk"); - else - bd.setBdId(gen.nextBdId(blockChannelMap[&block])); - } - } - for (TileElement memOp : memOps) { - DenseMap blockBdIdMap; - for (Block &block : memOp.getOperation()->getRegion(0)) { - if (block.getOps().empty()) continue; - DMABDOp bd = *block.getOps().begin(); - assert(bd.getBdId().has_value() && - "DMABDOp should have bd_id assigned by now"); - blockBdIdMap[&block] = bd.getBdId().value(); - } - - for (Block &block : memOp.getOperation()->getRegion(0)) { - if (block.getOps().empty()) continue; - DMABDOp bd = *block.getOps().begin(); - std::optional nextBdId; - if (block.getNumSuccessors()) { - assert(llvm::range_size(block.getSuccessors()) == 1 && - "should have only one successor block"); - Block *nextBlock = block.getSuccessor(0); - if (!blockBdIdMap.contains(nextBlock)) - assert(nextBlock->getOperations().size() == 1 && - // for some reason i can't stick both of ops in a single - // isa<...> - (isa(nextBlock->getOperations().front()) || - isa(nextBlock->getOperations().front())) && - "bb that's not in blockMap can only have aie.end"); - 
else - nextBdId = blockBdIdMap[nextBlock]; - bd.setNextBdId(nextBdId); - } - } - } + if (failed(assignBdIds(deviceOp))) signalPassFailure(); } }; diff --git a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt index 36bec5053..101042830 100644 --- a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt @@ -170,6 +170,7 @@ iree_cc_library( ::AIEDialectIR ::AIEXDialectIR ::AIENormalizeAddressSpacesGen + iree::target::amd-aie::Utils::Utils ) add_subdirectory(test) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index 94acaa8c0..dd812930c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -26,6 +26,14 @@ void AMDAIEDialect::initializeAMDAIEOps() { >(); } +//===----------------------------------------------------------------------===// +// AMDAIE_BdIdOp +//===----------------------------------------------------------------------===// + +void BdIdOp::getAsmResultNames(function_ref setNameFn) { + setNameFn(getResult(), "bd_id"); +} + //===----------------------------------------------------------------------===// // AMDAIE_ControlCodeOp //===----------------------------------------------------------------------===// @@ -430,14 +438,16 @@ void LogicalObjectFifoRelease::build(OpBuilder &b, mlir::OperationState &result, // AMDAIE_NpuDmaCpyNdOp //===----------------------------------------------------------------------===// -// Build a NpuDmaCpyNdOp with mixed static and dynamic entries. +// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target and +// source BD IDs. 
void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, ArrayRef targetOffsets, ArrayRef targetSizes, ArrayRef targetStrides, ArrayRef sourceOffsets, ArrayRef sourceSizes, - ArrayRef sourceStrides) { + ArrayRef sourceStrides, + mlir::Value targetBdId, mlir::Value sourceBdId) { SmallVector staticTargetOffsets, staticTargetSizes, staticTargetStrides; SmallVector staticSourceOffsets, staticSourceSizes, @@ -462,7 +472,7 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, dynamicTargetSizes, dynamicTargetStrides, staticTargetOffsets, staticTargetSizes, staticTargetStrides, dynamicSourceOffsets, dynamicSourceSizes, dynamicSourceStrides, staticSourceOffsets, - staticSourceSizes, staticSourceStrides); + staticSourceSizes, staticSourceStrides, targetBdId, sourceBdId); } // Build a NpuDmaCpyNdOp with static entries. @@ -472,7 +482,8 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, ArrayRef targetStrides, ArrayRef sourceOffsets, ArrayRef sourceSizes, - ArrayRef sourceStrides) { + ArrayRef sourceStrides, + mlir::Value targetBdId, mlir::Value sourceBdId) { SmallVector targetOffsetValues = llvm::to_vector<4>( llvm::map_range(targetOffsets, [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); @@ -499,14 +510,15 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, })); build(b, result, dma, targetOffsetValues, targetSizeValues, targetStrideValues, sourceOffsetValues, sourceSizeValues, - sourceStrideValues); + sourceStrideValues, targetBdId, sourceBdId); } // Build a NpuDmaCpyNdOp with dynamic entries. 
void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, ValueRange targetOffsets, ValueRange targetSizes, ValueRange targetStrides, ValueRange sourceOffsets, - ValueRange sourceSizes, ValueRange sourceStrides) { + ValueRange sourceSizes, ValueRange sourceStrides, + mlir::Value targetBdId, mlir::Value sourceBdId) { SmallVector targetOffsetValues = llvm::to_vector<4>(llvm::map_range( targetOffsets, [](Value v) -> OpFoldResult { return v; })); @@ -525,7 +537,7 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, sourceStrides, [](Value v) -> OpFoldResult { return v; })); build(b, result, dma, targetOffsetValues, targetSizeValues, targetStrideValues, sourceOffsetValues, sourceSizeValues, - sourceStrideValues); + sourceStrideValues, targetBdId, sourceBdId); } DoublyStridedOpInterface NpuDmaCpyNdOp::createDoublyStridedOp( @@ -544,7 +556,8 @@ DoublyStridedOpInterface NpuDmaCpyNdOp::createDoublyStridedOp( getValueOrCreateConstantIndexOp(rewriter, loc, newTargetStrides), getValueOrCreateConstantIndexOp(rewriter, loc, newSourceOffsets), getValueOrCreateConstantIndexOp(rewriter, loc, newSourceSizes), - getValueOrCreateConstantIndexOp(rewriter, loc, newSourceStrides)); + getValueOrCreateConstantIndexOp(rewriter, loc, newSourceStrides), + getTargetBdId(), getSourceBdId()); return cast(newOp.getOperation()); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 5c518e3f6..e8a21f494 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -160,6 +160,45 @@ def AMDAIE_WorkgroupOp : AMDAIE_Op<"workgroup", let hasVerifier = 1; } +//===----------------------------------------------------------------------===// +// IREE AMDAIE DMA Utility Ops +//===----------------------------------------------------------------------===// + +def AMDAIE_BdIdOp: AMDAIE_Op<"bd_id", [ + 
Pure, + DeclareOpInterfaceMethods + ]>, Results<(outs Index)> { + let summary = "Represents a physical buffer descriptor ID on an AIE tile."; + let description = [{ + This operation represents a buffer descriptor ID on an AIE tile. The buffer + descriptor ID is specified through a tile and an ID value, designating the + exact local buffer descriptor to be used on the tile. This op helps with + guaranteeing/verifying correct reuse of the same id across DMA operations. + + Background: DMAs are programmed through buffer descriptors (BDs) with each + one having a corresponding BD ID. For example, initially, a BD with ID 0 is + configured by some actor and afterwards a DMA is programmed to execute BD ID + 0. However, other DMAs can potentially reuse the same BD ID, resulting in + potential race conditions. For example, the BD with ID 0 on some AIE tile + could be reconfigured by some actor before having been fully executed by a + DMA, potentially leading to incorrect behaviour. + + Example: + + ```mlir + %tile = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile, 0) + ``` + }]; + + let arguments = ( + ins Index:$tile, + UI32Attr:$value + ); + + let assemblyFormat = [{ `(` $tile `,` $value `)` attr-dict }]; +} + //===----------------------------------------------------------------------===// // IREE AMDAIE Npu Ops //===----------------------------------------------------------------------===// @@ -206,7 +245,9 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", Variadic:$source_strides, DenseI64ArrayAttr:$source_static_offsets, DenseI64ArrayAttr:$source_static_sizes, - DenseI64ArrayAttr:$source_static_strides + DenseI64ArrayAttr:$source_static_strides, + Optional:$target_bd_id, + Optional:$source_bd_id ); let assemblyFormat = [{ @@ -215,10 +256,12 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", custom($target_offsets, $target_static_offsets) custom($target_sizes, $target_static_sizes) custom($target_strides, $target_static_strides) + (`bd_id` `=` 
$target_bd_id^)? `,` custom($source_offsets, $source_static_offsets) custom($source_sizes, $source_static_sizes) custom($source_strides, $source_static_strides) + (`bd_id` `=` $source_bd_id^)? `)` attr-dict }]; @@ -230,17 +273,20 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", "ArrayRef":$target_strides, "ArrayRef":$source_offsets, "ArrayRef":$source_sizes, - "ArrayRef":$source_strides)>, + "ArrayRef":$source_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source_bd_id)>, // Build a NpuDmaCpyNdOp with static entries. OpBuilder<(ins "Value":$dma, "ArrayRef":$target_offsets, "ArrayRef":$target_sizes, "ArrayRef":$target_strides, "ArrayRef":$source_offsets, "ArrayRef":$source_sizes, - "ArrayRef":$source_strides)>, + "ArrayRef":$source_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source_bd_id)>, // Build a NpuDmaCpyNdOp with dynamic entries. OpBuilder<(ins "Value":$dma, "ValueRange":$target_offsets, "ValueRange":$target_sizes, "ValueRange":$target_strides, "ValueRange":$source_offsets, "ValueRange":$source_sizes, - "ValueRange":$source_strides)> + "ValueRange":$source_strides, "::mlir::Value":$target_bd_id, + "::mlir::Value":$source_bd_id)> ]; let extraClassDeclaration = [{ @@ -302,6 +348,18 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", return memSpace ? cast(memSpace).getInt() : 0; } + BdIdOp getSourceBdIdOp() { + Value bdIdValue = getSourceBdId(); + if (!bdIdValue || !bdIdValue.getDefiningOp()) return nullptr; + return dyn_cast(bdIdValue.getDefiningOp()); + } + + BdIdOp getTargetBdIdOp() { + Value bdIdValue = getTargetBdId(); + if (!bdIdValue || !bdIdValue.getDefiningOp()) return nullptr; + return dyn_cast(bdIdValue.getDefiningOp()); + } + // A utility to create a new doubly strided operation from this one with a // new set of source and target offsets, sizes and strides. 
DoublyStridedOpInterface createDoublyStridedOp( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 709a48676..82b20c05c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -1,5 +1,18 @@ // RUN: iree-opt --split-input-file %s | FileCheck %s +// CHECK-LABEL: func.func @bd_id +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[BD_ID:.*]] = amdaie.bd_id(%[[TILE_0]], 0) +func.func @bd_id() { + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile, 0) + return +} + +// ----- + // CHECK-LABEL: func.func @core // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) @@ -210,6 +223,34 @@ func.func @npu_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c128 = arith.constant 128 : index + %tile = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile, 0) + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.npu.dma_cpy_nd %0([%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c128, %c16, %c1] bd_id = %bd_id, [%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c16, %c16, %c1] bd_id = %bd_id) + return +} + +// ----- + // CHECK-LABEL: func.func @npu_dma_cpy_nd_inline_literals // CHECK: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp new file mode 100644 
index 000000000..3e683d50d --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp @@ -0,0 +1,179 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" + +#define DEBUG_TYPE "iree-amdaie-assign-npu-dma-bd-ids" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +/// Assign BD ids to NPU dma operations using the BD generator +LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { + IRRewriter rewriter(workgroupOp->getContext()); + + // Get the device model. + std::optional device = getConfigAMDAIEDevice(workgroupOp); + if (!device) + return workgroupOp->emitOpError() + << "could not find an AMDAIEDevice attribute"; + AMDAIEDeviceModel deviceModel = AMDAIE::getDeviceModel(device.value()); + + // Create a BD ID generator for every shim tile. + DenseMap shimTileToGeneratorMap; + workgroupOp->walk([&](AMDAIE::TileOp tileOp) { + std::optional col = getConstantIntValue(tileOp.getCol()); + std::optional row = getConstantIntValue(tileOp.getRow()); + if (col && row && deviceModel.isShimNOCTile(col.value(), row.value())) { + ChannelBdIdGenerator generator( + deviceModel.getChannelToValidBdIds(AMDAIETileType::SHIMNOC)); + shimTileToGeneratorMap[tileOp.getResult()] = std::move(generator); + } + }); + + // Utility to retrieve a TileOp from a vector of tile values, while doing + // appropriate verifications. 
+ auto getGeneratorTileOp = [&](AMDAIE::NpuDmaCpyNdOp &npuDmaOp, + const SmallVector &tiles, + AMDAIE::TileOp &tileOp) -> LogicalResult { + if (tiles.size() != 1) { + return npuDmaOp.emitOpError() + << "operating on multiple tiles is not supported"; + } + Value tile = tiles[0]; + if (!shimTileToGeneratorMap.contains(tile)) { + return npuDmaOp.emitOpError() + << "no channel BD ID generator found for tile: " << tile; + } + tileOp = dyn_cast(tile.getDefiningOp()); + if (!tileOp) return npuDmaOp.emitOpError() << "no tile op found"; + return success(); + }; + + // Walk `amdaie.npu.dma_cpy_nd` and `amdaie.dma_wait` operations and assign + // and release BD IDs when encountering the respective operations using the + // tile BD ID generators initialized earlier. + AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); + WalkResult res = controlCodeOp->walk([&](Operation *op) { + if (auto npuDmaOp = dyn_cast(op)) { + AMDAIE::CircularDmaCpyNdOp inputDma = npuDmaOp.getDmaCpyNdOp(); + if (npuDmaOp.getSourceMemorySpaceAsUInt() == 0) { + SmallVector tiles = inputDma.getSourceObjectFifo().getTiles(); + AMDAIE::TileOp tileOp; + if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp))) + return WalkResult::interrupt(); + ChannelBdIdGenerator &generator = + shimTileToGeneratorMap[tileOp.getResult()]; + // TODO(jornt): Temporarily use channel 0 for all DMAs. This should + // return correct results for Shim channels, however, for generality + // towards other DMAs and future hardware generations, channel + // assignment should happen before BD assignment. This requires more + // refactoring. 
+ std::optional bdId = generator.getAndAssignBdId(0); + rewriter.setInsertionPointAfter(tileOp); + auto bdIdOp = rewriter.create(rewriter.getUnknownLoc(), + tileOp, bdId.value()); + rewriter.setInsertionPoint(npuDmaOp); + npuDmaOp = rewriter.replaceOpWithNewOp( + npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTargetMixedOffsets(), + npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(), + npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(), + npuDmaOp.getSourceMixedStrides(), npuDmaOp.getTargetBdId(), bdIdOp); + } + if (npuDmaOp.getTargetMemorySpaceAsUInt() == 0) { + SmallVector tiles = inputDma.getTargetObjectFifo().getTiles(); + AMDAIE::TileOp tileOp; + if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp))) + return WalkResult::interrupt(); + ChannelBdIdGenerator &generator = + shimTileToGeneratorMap[tileOp.getResult()]; + // TODO(jornt): Temporarily use channel 0 for all DMAs. This should + // return correct results for Shim channels, however, for generality + // towards other DMAs and future hardware generations, channel + // assignment should happen before BD assignment. This requires more + // refactoring. + std::optional bdId = generator.getAndAssignBdId(0); + rewriter.setInsertionPointAfter(tileOp); + auto bdIdOp = rewriter.create(rewriter.getUnknownLoc(), + tileOp, bdId.value()); + rewriter.setInsertionPoint(npuDmaOp); + (void)rewriter.replaceOpWithNewOp( + npuDmaOp, npuDmaOp.getDma(), npuDmaOp.getTargetMixedOffsets(), + npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(), + npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(), + npuDmaOp.getSourceMixedStrides(), bdIdOp, npuDmaOp.getSourceBdId()); + } + return WalkResult::advance(); + } else if (auto npuWaitOp = dyn_cast(op)) { + // Release BD ID used by input DMA op. 
+ AMDAIE::NpuDmaCpyNdOp npuDmaOp = npuWaitOp.getDmaOp(); + AMDAIE::BdIdOp bdIdOp; + if (npuDmaOp.getSourceBdId()) { + bdIdOp = cast(npuDmaOp.getSourceBdId().getDefiningOp()); + } else if (npuDmaOp.getTargetBdId()) { + bdIdOp = cast(npuDmaOp.getTargetBdId().getDefiningOp()); + } else { + return WalkResult::advance(); + } + if (!bdIdOp) return WalkResult::advance(); + auto tileOp = dyn_cast(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) { + bdIdOp.emitOpError() << "doesn't operate on a `amdaie.tile` operation"; + return WalkResult::interrupt(); + } + if (!shimTileToGeneratorMap.contains(tileOp.getResult())) { + bdIdOp.emitOpError() + << "no BD ID generator found for this BD ID op's tile"; + return WalkResult::interrupt(); + } + ChannelBdIdGenerator &generator = + shimTileToGeneratorMap[tileOp.getResult()]; + uint32_t value = bdIdOp.getValue(); + generator.releaseBdId(value); + return WalkResult::advance(); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +class AMDAIEAssignNpuDmaBdIdsPass + : public impl::AMDAIEAssignNpuDmaBdIdsBase { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + AMDAIEAssignNpuDmaBdIdsPass() = default; + AMDAIEAssignNpuDmaBdIdsPass(const AMDAIEAssignNpuDmaBdIdsPass &pass){}; + void runOnOperation() override; +}; + +void AMDAIEAssignNpuDmaBdIdsPass::runOnOperation() { + Operation *parentOp = getOperation(); + + WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { + if (failed(assignNpuDmaBdIds(workgroupOp))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEAssignNpuDmaBdIdsPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp index 2394948b6..0b0649eae 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECreateAIEWorkgroup.cpp @@ -129,7 +129,7 @@ LogicalResult WorkgroupBuilder::buildForDmaCpyNdOp( auto ipuDmaCpy = controlCodeRewriter.createAndLookup( loc, newDmaOp.getResult(), ipuDmaTargetOffsets, ipuDmaTargetSizes, ipuDmaTargetStrides, ipuDmaSourceOffsets, ipuDmaSourceSizes, - ipuDmaSourceStrides); + ipuDmaSourceStrides, nullptr, nullptr); DMAChannelDir direction = !sourceMemSpace ? DMAChannelDir::MM2S : DMAChannelDir::S2MM; controlCodeRewriter.createAndLookup( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 68cfc51d1..df9b63592 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -438,6 +438,10 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, rewriter.setInsertionPoint(dmaOp); // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves. if (dmaOp.hasSourceAddressing() || dmaOp.getSourceMemorySpaceAsUInt() == 0) { + AMDAIE::BdIdOp bdIdOp = dmaOp.getSourceBdIdOp(); + if (!bdIdOp) + return dmaOp.emitOpError() << "expected to have a source BD ID op"; + // DmaOp either has explicit source addressing OR the defining op of its // source has its source on L3. 
SmallVector empty; @@ -470,14 +474,17 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, return dmaOp.emitError() << "input isn't mapped to an `aie.objectifo` operation"; } - // TODO(jornt): use bd_id != 0 bool issueToken = dmaOp.hasDmaWaitOpUser(); rewriter.create( rewriter.getUnknownLoc(), SmallVector{}, 0, 0, memref, empty, empty, empty, staticOffsets, staticSizes, staticStrides, - objFifo.getName(), 0, issueToken); + objFifo.getName(), bdIdOp.getValue(), issueToken); } if (dmaOp.hasTargetAddressing() || dmaOp.getTargetMemorySpaceAsUInt() == 0) { + AMDAIE::BdIdOp bdIdOp = dmaOp.getTargetBdIdOp(); + if (!bdIdOp) + return dmaOp.emitOpError() << "expected to have a target BD ID op"; + // DmaOp either has explicit target addressing OR the defining op of its // source has its target on L3. SmallVector empty; @@ -510,11 +517,10 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter, << "input isn't mapped to an `aie.objectifo` operation"; } bool issueToken = dmaOp.hasDmaWaitOpUser(); - // TODO(jornt): use bd_id != 0 rewriter.create( rewriter.getUnknownLoc(), SmallVector{}, 0, 0, memref, empty, empty, empty, staticOffsets, staticSizes, staticStrides, - objFifo.getName(), 0, issueToken); + objFifo.getName(), bdIdOp.getValue(), issueToken); } toBeErased.push_back(dmaOp); return success(); @@ -669,6 +675,11 @@ LogicalResult workgroupToAIE(IRRewriter &rewriter, int dmaId = 0; WalkResult res = workgroupOp.walk([&](Operation *op) { return TypeSwitch(op) + .Case([&](auto bdIdOp) { + // BD ID ops are purely used for retrieving information in other ops + // so don't convert to AIE dialect. 
+ return WalkResult::advance(); + }) .Case([&](auto dmaOp) { if (failed(circularDmaToAIE(rewriter, dmaOp, mapper, deviceBlock, dmaId))) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp index 20edef734..5b1d71af4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp @@ -22,6 +22,12 @@ std::optional getConfigAMDAIEDevice( return AMDAIE::symbolizeEnum(attr.value().getValue()); } +std::optional getConfigAMDAIEDevice(Operation *op) { + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op); + if (!targetAttr) return std::nullopt; + return getConfigAMDAIEDevice(targetAttr); +} + namespace { /// Generate a DenseMap key we can use for the element types (alternatives diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h index cab6e4e1e..1350bb4d9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h @@ -20,6 +20,10 @@ namespace mlir::iree_compiler::AMDAIE { std::optional getConfigAMDAIEDevice( IREE::HAL::ExecutableTargetAttr targetAttr); +/// Returns the AMDAIE device from an operation. Looks for an executable target +/// attr in the AST. 
+std::optional getConfigAMDAIEDevice(Operation *op); + // This function is based on the following table pulled from the // AIEVec_MatMulOp documentation in // mlir-aie/include/aie/Dialect/AIEVec/IR/AIEVecOps.td diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 5ffacc1c0..6bf4d174f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -46,6 +46,7 @@ iree_cc_library( "AMDAIEAccessToAcquireRelease.cpp" "AMDAIEAddLoweringStrategy.cpp" "AMDAIEAIRDmaToAMDAIEDma.cpp" + "AMDAIEAssignNpuDmaBdIds.cpp" "AMDAIEBufferizeToAllocation.cpp" "AMDAIECanonicalizeDma.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" @@ -96,6 +97,8 @@ iree_cc_library( iree::compiler::Dialect::LinalgExt::IR iree::compiler::Dialect::LinalgExt::Transforms iree::compiler::Utils + iree-amd-aie::aie_runtime::iree_aie_runtime_static + iree-amd-aie::aie_runtime::Utils iree::target::amd-aie::IR::AMDAIEDialect iree::target::amd-aie::aie::AIEDialectIR iree::target::amd-aie::aie::AIEPasses diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 1b1d3276f..4ac345def 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -22,6 +22,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DECL #define GEN_PASS_DEF_AMDAIEACCESSTOACQUIRERELEASE #define GEN_PASS_DEF_AMDAIEAIRDMATOAMDAIEDMA +#define GEN_PASS_DEF_AMDAIEASSIGNNPUDMABDIDS #define GEN_PASS_DEF_AMDAIEBRIDGETOAIR #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION #define GEN_PASS_DEF_AMDAIECANONICALIZEDMA diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 88b651b10..24c8cfcad 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -587,6 +587,14 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIEDmaLoopSubsumptionPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); + + passManager.addPass(createAMDAIEAssignNpuDmaBdIdsPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIEControlCodeLoopUnrollPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 615b412b5..e7871c529 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -56,6 +56,9 @@ std::unique_ptr createAMDAIEAccessToAcquireReleasePass(); /// logical objectFifos. std::unique_ptr createAMDAIEAIRDmaAMDAIEDmaPass(); +/// Create a pass to assign BD ids to `amdaie.npu.dma_cpy_nd` operations. +std::unique_ptr createAMDAIEAssignNpuDmaBdIdsPass(); + /// Create a pass to do some rewrites that help bridging the path to AIR/AIE /// lowering. 
std::unique_ptr createAMDAIEBridgeToAIRPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 1c2fd2faa..9f29ff208 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -23,6 +23,12 @@ def AMDAIEAIRDmaToAMDAIEDma : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAIRDmaAMDAIEDmaPass()"; } +def AMDAIEAssignNpuDmaBdIds : + Pass<"iree-amdaie-assign-npu-dma-bd-ids", ""> { + let summary = "Assign BD ids to `amdaie.npu.dma_cpy_nd` operations."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignNpuDmaBdIdsPass()"; +} + def AMDAIEBridgeToAIR : Pass<"iree-amdaie-bridge-to-air", ""> { let summary = "Perform transformations that allow hooking into AIR/AIE lowering"; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEBridgeToAIRPass()"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index ca0ada6b3..640cfc950 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -11,6 +11,7 @@ iree_lit_test_suite( "access_to_acquire_release.mlir" "aie_link_executables.mlir" "air_dma_to_amdaie_dma.mlir" + "assign_npu_dma_bd_ids.mlir" "bridge_to_air.mlir" "bufferize_to_allocation.mlir" "canonicalize_dma.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir new file mode 100644 index 000000000..8cb4ba4e7 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir @@ -0,0 +1,292 @@ +// RUN: iree-opt 
--pass-pipeline="builtin.module(iree-amdaie-assign-npu-dma-bd-ids,canonicalize,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s + +module { + // expected-error @+1 {{could not find an AMDAIEDevice attribute}} + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @single_dma_cpy_nd_on_source +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: amdaie.workgroup +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @single_dma_cpy_nd_on_source(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + amdaie.npu.dma_wait(%1, MM2S) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @single_dma_cpy_nd_on_target +// CHECK: %[[C0:.+]] = 
arith.constant 0 : index +// CHECK: amdaie.workgroup +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] []) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @single_dma_cpy_nd_on_target(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %0 = amdaie.circular_dma_cpy_nd(%from_memref_0[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) + amdaie.npu.dma_wait(%1, S2MM) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @multiple_dma_cpy_on_diff_tiles +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[C2:.+]] = arith.constant 2 : index +// CHECK: amdaie.workgroup +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK: %[[BD_ID_1:.+]] = 
amdaie.bd_id(%[[TILE_1_0]], 0) +// CHECK: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) +// CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @multiple_dma_cpy_on_diff_tiles(%arg0: memref<8x16xi32>, %arg1: memref<8x16xi32>, %arg2: memref<8x16xi32>, %arg3: memref<1x1x8x16xi32, 1>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_0 = amdaie.tile(%c1, %c0) + %tile_2_0 = amdaie.tile(%c2, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_3 = amdaie.logicalobjectfifo.from_memref 
%arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0] [8, 16] [16, 1]) + %2 = amdaie.npu.dma_cpy_nd %dma2([] [] [], [0] [128] [1]) + amdaie.npu.dma_wait(%0, MM2S) + amdaie.npu.dma_wait(%1, MM2S) + amdaie.npu.dma_wait(%2, MM2S) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @multiple_dma_cpy_with_bd_id_reuse +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: amdaie.workgroup +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_0]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = 
#executable_target_amdaie_xclbin_fb} { + func.func @multiple_dma_cpy_with_bd_id_reuse(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + amdaie.npu.dma_wait(%1, MM2S) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0] [8, 16] [16, 1]) + amdaie.npu.dma_wait(%2, MM2S) + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0] [128] [1]) + amdaie.npu.dma_wait(%3, MM2S) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @multiple_dma_cpy_with_diff_bd_id +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: amdaie.workgroup +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], 1) +// CHECK-DAG: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], 2) +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_2]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) +// CHECK: 
amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @multiple_dma_cpy_with_diff_bd_id(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %0 = amdaie.circular_dma_cpy_nd(%from_memref_1[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0] [8, 16] [16, 1]) + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0] [128] [1]) + amdaie.npu.dma_wait(%1, MM2S) + amdaie.npu.dma_wait(%2, MM2S) + amdaie.npu.dma_wait(%3, MM2S) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @nested_loops +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[C2:.+]] = arith.constant 2 : index +// CHECK: %[[C6:.+]] = arith.constant 6 : index +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[BD_ID_0_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], 1) +// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_1_0:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0) +// CHECK-DAG: 
%[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 1) +// CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) +// CHECK-DAG: %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) +// CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]]) +// CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]]) +// CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]]) +// CHECK: %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([] [] [], [0] [128] [1] bd_id = %[[BD_ID_0_1]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]], MM2S) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_3]], MM2S) +// CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], [] [] [] bd_id = %[[BD_ID_2_0]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_4]], MM2S) +// CHECK: } +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]], MM2S) +// CHECK: } +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]], MM2S) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @nested_loops(%arg0: memref<8x16xi32>, %arg1: memref<8x16xi32>, %arg2: memref<8x16xi32>, %arg3: memref<1x1x8x16xi32, 1>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %tile_0_0 = 
amdaie.tile(%c0, %c0) + %tile_1_0 = amdaie.tile(%c1, %c0) + %tile_2_0 = amdaie.tile(%c2, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_1_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_2_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> + %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> + %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma1 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma2 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %from_memref_2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %0 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) + scf.forall (%arg4, %arg5) in (2, 2) { + %1 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) + scf.for %arg6 = %c0 to %c6 step %c1 { + %2 = amdaie.npu.dma_cpy_nd %dma1([] [] [], [0, 0] [1, 128] [128, 1]) + %3 = amdaie.npu.dma_cpy_nd %dma0([] [] [], [0] [128] [1]) + amdaie.npu.dma_wait(%2, MM2S) + amdaie.npu.dma_wait(%3, MM2S) + %4 = amdaie.npu.dma_cpy_nd %dma2([] [] [], [] [] []) + amdaie.npu.dma_wait(%4, MM2S) + } + amdaie.npu.dma_wait(%1, MM2S) + } + amdaie.npu.dma_wait(%0, MM2S) + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index 2b214e363..2f7b56499 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -315,6 +315,44 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @invalid_npu_dma_cpy_nd() { + amdaie.workgroup { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64xi32> + memref.assume_alignment %2, 64 : memref<32x64xi32> + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_2 = amdaie.tile(%c0, %c2) + %alloc_1 = memref.alloc() : memref<32x32xi32, 1> + %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> + %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> + %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> + %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %dma_target_l3 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.logicalobjectfifo.link[%dma0] -> [%dma_target_l3] () + memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> + memref.dealloc %alloc_1 : memref<32x32xi32, 1> + // expected-error @+1 {{could not convert to AIEDialect ops}} + amdaie.controlcode { + // 
expected-error @+1 {{op expected to have a target BD ID op}} + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) + amdaie.npu.dma_wait(%npu_dma_0, S2MM) + amdaie.end + } + } + return + } +} + +// ----- + // Test to show mix of implicit/explicit source/target addressing in amdaie.npu.dma_cpy_nd. // CHECK: aie.device @@ -360,6 +398,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) + %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> @@ -373,13 +412,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0, %c32] [%c32, %c32] [%c64, %c1], [] [] []) + %npu_dma_0 = amdaie.npu.dma_cpy_nd %dma_target_l3([%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0, [] [] []) amdaie.npu.dma_wait(%npu_dma_0, S2MM) - %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3([] [] [], [] [] []) + %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3([] [] [] bd_id = %bd_id_0, [] [] []) amdaie.npu.dma_wait(%npu_dma_1, S2MM) - %npu_dma_2 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], [%c0, %c32] [%c32, %c32] [%c64, %c1]) + %npu_dma_2 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], [%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) amdaie.npu.dma_wait(%npu_dma_2, MM2S) - %npu_dma_3 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], [] [] []) + %npu_dma_3 = amdaie.npu.dma_cpy_nd %dma_source_l3([] [] [], [] [] [] bd_id = %bd_id_0) amdaie.npu.dma_wait(%npu_dma_3, MM2S) amdaie.end @@ -407,6 +446,7 @@ module attributes 
{hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) + %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x16x64x128x32xi32> -> !amdaie.logicalobjectfifo> @@ -420,7 +460,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // expected-error @+1 {{could not convert to AIEDialect ops}} amdaie.controlcode { // expected-error @+1 {{implicit source/target L3 memref has rank greater than the expected static offsets/sizes/strides rank (4)}} - %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3([] [] [], [] [] []) + %npu_dma_1 = amdaie.npu.dma_cpy_nd %dma_target_l3([] [] [] bd_id = %bd_id_0, [] [] []) amdaie.npu.dma_wait(%npu_dma_1, S2MM) amdaie.end @@ -494,6 +534,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) %tile_1_2 = amdaie.tile(%c1, %c2) + %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) %0 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64xi32> memref.assume_alignment %0, 64 : memref<32x64xi32> %alloc_1 = memref.alloc() : memref<32x32xi32, 1> @@ -525,7 +566,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> memref.dealloc %alloc_1 : memref<32x32xi32, 1> amdaie.controlcode { - %npu_dma = amdaie.npu.dma_cpy_nd %dma0([] [] [], [%c0, %c32] [%c32, %c32] [%c64, %c1]) + %npu_dma = amdaie.npu.dma_cpy_nd %dma0([] [] [], [%c0, %c32] [%c32, %c32] [%c64, %c1] bd_id = %bd_id_0) amdaie.npu.dma_wait(%npu_dma, MM2S) amdaie.end } diff --git a/runtime/src/iree-amd-aie/aie_runtime/CMakeLists.txt b/runtime/src/iree-amd-aie/aie_runtime/CMakeLists.txt index 
8959c3b09..2154bb89d 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/aie_runtime/CMakeLists.txt @@ -490,6 +490,8 @@ endif() # iree-aie-runtime-static # ############################################################################## +iree_add_all_subdirs() + iree_tablegen_library( NAME AMDAIEEnumsGen @@ -548,5 +550,3 @@ target_link_libraries(iree-amd-aie_aie_runtime_iree_aie_runtime_static # consumers (like tests) to link individually target_link_libraries(iree-amd-aie_aie_runtime_iree_aie_runtime_static PUBLIC LLVMSupport) - -add_subdirectory(test) diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt b/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt new file mode 100644 index 000000000..c3f6c0e26 --- /dev/null +++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/CMakeLists.txt @@ -0,0 +1,24 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(IREE_PACKAGE_ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}") +set(IREE_PACKAGE_ROOT_PREFIX "iree-amd-aie::aie_runtime::Utils") +iree_add_all_subdirs() + +iree_cc_library( + NAME + Utils + HDRS + "ChannelBdIdGenerator.h" + SRCS + "ChannelBdIdGenerator.cpp" + DEPS + LLVMSupport + MLIRIR + MLIRParser + MLIRSupport + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp new file mode 100644 index 000000000..339146e5f --- /dev/null +++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp @@ -0,0 +1,27 @@ +// Copyright 2020 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+/// Returns (and marks assigned) the first free valid BD id for `channel`.
+std::optional<uint32_t> ChannelBdIdGenerator::getAndAssignBdId(
+    uint32_t channel) {
+  // Look the channel up once; `find` never inserts, unlike `operator[]`.
+  auto it = channelToValidBdIds.find(channel);
+  if (it == channelToValidBdIds.end()) return std::nullopt;
+  // Valid BD ids are tried in their stored order.
+  for (uint32_t bdId : it->second) {
+    if (isBdIdAssigned(bdId)) continue;
+    assignBdId(bdId);
+    return bdId;
+  }
+  // The channel has no valid BD ids, or all of them are already assigned.
+  return std::nullopt;
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h
new file mode 100644
index 000000000..e478ac155
--- /dev/null
+++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h
@@ -0,0 +1,53 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_AMDAIE_UTILS_CHANNEL_BD_ID_GENERATOR_H_
+#define IREE_COMPILER_AMDAIE_UTILS_CHANNEL_BD_ID_GENERATOR_H_
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "mlir/Support/LogicalResult.h"
+
+using namespace llvm;
+
+namespace mlir::iree_compiler::AMDAIE {
+
+/// Utility to generate valid buffer descriptor (BD) ids for channels. Keeps
+/// state on assigned BD ids to avoid reuse.
+class ChannelBdIdGenerator {
+ public:
+  ChannelBdIdGenerator() = default;
+  ChannelBdIdGenerator(
+      const DenseMap<uint32_t, SmallVector<uint32_t>> &channelToValidBdIds)
+      : channelToValidBdIds(channelToValidBdIds) {}
+  ChannelBdIdGenerator(
+      DenseMap<uint32_t, SmallVector<uint32_t>> &&channelToValidBdIds)
+      : channelToValidBdIds(std::move(channelToValidBdIds)) {}
+
+  /// Marks the provided BD id as assigned; a no-op if it already is.
+  void assignBdId(uint32_t bdId) { assignedBdIds.insert(bdId); }
+
+  /// Attempts to find and assign an unused BD id for the provided channel.
+  /// Returns `std::nullopt` if no valid BD id could be found.
+  std::optional<uint32_t> getAndAssignBdId(uint32_t channel);
+
+  /// Checks whether the provided BD id is currently assigned.
+  bool isBdIdAssigned(uint32_t bdId) const { return assignedBdIds.count(bdId); }
+
+  /// Releases the provided BD id, if assigned, so that it can be reused.
+  void releaseBdId(uint32_t bdId) { assignedBdIds.erase(bdId); }
+
+ private:
+  // Maps channel indices to vectors of valid BD ids.
+  DenseMap<uint32_t, SmallVector<uint32_t>> channelToValidBdIds;
+  // Set with all BD ids that are currently assigned.
+  DenseSet<uint32_t> assignedBdIds;
+};
+
+}  // namespace mlir::iree_compiler::AMDAIE
+
+#endif  // IREE_COMPILER_AMDAIE_UTILS_CHANNEL_BD_ID_GENERATOR_H_
diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/test/CMakeLists.txt b/runtime/src/iree-amd-aie/aie_runtime/Utils/test/CMakeLists.txt
new file mode 100644
index 000000000..c430bb7be
--- /dev/null
+++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/test/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_add_all_subdirs()
+
+iree_cc_test(
+  NAME
+    ChannelBdIdGeneratorTest
+  SRCS
+    "ChannelBdIdGeneratorTest.cpp"
+  DEPS
+    gtest
+    iree-amd-aie::aie_runtime::Utils::Utils
+)
diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp b/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp
new file mode 100644
index 000000000..3b05cf19d
--- /dev/null
+++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp
@@ -0,0 +1,112 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+#include <numeric>
+
+#include "gtest/gtest.h"
+#include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h"
+
+
+namespace {
+
+
+using namespace mlir::iree_compiler::AMDAIE;
+
+
+DenseMap<uint32_t, SmallVector<uint32_t>>
+getTestSingleRangeChannelToValidBdIds() {
+  SmallVector<uint32_t> range(3);  // BD ids 0..2, shared by channels 0/1
+  std::iota(range.begin(), range.end(), 0);
+  DenseMap<uint32_t, SmallVector<uint32_t>> channelToValidBdIds = {{0, range},
+                                                                   {1, range}};
+  return channelToValidBdIds;
+}
+
+
+DenseMap<uint32_t, SmallVector<uint32_t>> getTestEvenOddChannelToValidBdIds() {
+  SmallVector<uint32_t> evenRange(4);  // BD ids 0..3: channels 0/2/4
+  std::iota(evenRange.begin(), evenRange.end(), 0);
+  SmallVector<uint32_t> oddRange(4);  // BD ids 4..7: channels 1/3/5
+  std::iota(oddRange.begin(), oddRange.end(), 4);
+  DenseMap<uint32_t, SmallVector<uint32_t>> channelToValidBdIds = {
+      {0, evenRange}, {1, oddRange}, {2, evenRange},
+      {3, oddRange}, {4, evenRange}, {5, oddRange}};
+  return channelToValidBdIds;
+}
+
+
+TEST(ChannelBdIdGeneratorTest, SingleRange) {
+  ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds());
+  EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0);
+  EXPECT_EQ(generator.isBdIdAssigned(0), true);
+  EXPECT_EQ(generator.getAndAssignBdId(1).value(), 1);
+  EXPECT_EQ(generator.isBdIdAssigned(1), true);
+  EXPECT_EQ(generator.getAndAssignBdId(0).value(), 2);
+  EXPECT_EQ(generator.isBdIdAssigned(2), true);
+  EXPECT_EQ(generator.getAndAssignBdId(1), std::nullopt);
+}
+
+
+TEST(ChannelBdIdGeneratorTest, EvenOdd) {
+  ChannelBdIdGenerator generator(getTestEvenOddChannelToValidBdIds());
+  // Check that even channel BDs start from 0
+  EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0);
+  EXPECT_EQ(generator.isBdIdAssigned(0), true);
+  // Check that odd channel BDs start from 4
+  EXPECT_EQ(generator.getAndAssignBdId(1).value(), 4);
+  EXPECT_EQ(generator.isBdIdAssigned(4), true);
+  // Check assignment of other even BDs
+  EXPECT_EQ(generator.getAndAssignBdId(2).value(), 1);
+  EXPECT_EQ(generator.isBdIdAssigned(1), true);
+  EXPECT_EQ(generator.getAndAssignBdId(4).value(), 2);
+  EXPECT_EQ(generator.isBdIdAssigned(2), true);
+  EXPECT_EQ(generator.getAndAssignBdId(0).value(), 3);
+  EXPECT_EQ(generator.isBdIdAssigned(3), true);
+  EXPECT_EQ(generator.getAndAssignBdId(2), std::nullopt);
+  // Check assignment of other odd BDs
+  EXPECT_EQ(generator.getAndAssignBdId(3).value(), 5);
+  EXPECT_EQ(generator.isBdIdAssigned(5), true);
+  EXPECT_EQ(generator.getAndAssignBdId(5).value(), 6);
+  EXPECT_EQ(generator.isBdIdAssigned(6), true);
+  EXPECT_EQ(generator.getAndAssignBdId(1).value(), 7);
+  EXPECT_EQ(generator.isBdIdAssigned(7), true);
+  EXPECT_EQ(generator.getAndAssignBdId(3), std::nullopt);
+}
+
+
+TEST(ChannelBdIdGeneratorTest, AssignBdId) {
+  ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds());
+  generator.assignBdId(0);
+  EXPECT_EQ(generator.getAndAssignBdId(0).value(), 1);
+  EXPECT_EQ(generator.isBdIdAssigned(1), true);
+  generator.assignBdId(2);
+  EXPECT_EQ(generator.getAndAssignBdId(1), std::nullopt);
+}
+
+
+TEST(ChannelBdIdGeneratorTest, Release) {
+  ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds());
+  EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0);
+  EXPECT_EQ(generator.isBdIdAssigned(0), true);
+  generator.releaseBdId(0);
+  EXPECT_EQ(generator.getAndAssignBdId(1).value(), 0);
+  EXPECT_EQ(generator.isBdIdAssigned(0), true);
+  EXPECT_EQ(generator.getAndAssignBdId(0).value(), 1);
+  EXPECT_EQ(generator.isBdIdAssigned(1), true);
+  generator.releaseBdId(1);
+  EXPECT_EQ(generator.getAndAssignBdId(1).value(), 1);
+  EXPECT_EQ(generator.isBdIdAssigned(1), true);
+}
+
+
+}  // namespace
+
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
index e694cda5d..efc59db3d 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc
@@ -7,6 +7,7 @@
 #include "iree_aie_runtime.h"
 
 #include
+#include <numeric>
 
 #include "llvm/ADT/StringExtras.h"
 
@@ -468,6 +469,32 @@ uint32_t AMDAIEDeviceModel::getColumnShift() const {
 }
 uint32_t AMDAIEDeviceModel::getRowShift() const { return configPtr.RowShift; }
 
+DenseMap<uint32_t, SmallVector<uint32_t>>
+AMDAIEDeviceModel::getChannelToValidBdIds(AMDAIETileType tileType) const {
+  switch (tileType) {
+    case AMDAIETileType::MEMTILE: {
+      SmallVector<uint32_t> evenRange(24);  // BD ids 0..23: channels 0/2/4
+      std::iota(evenRange.begin(), evenRange.end(), 0);
+      SmallVector<uint32_t> oddRange(24);  // BD ids 24..47: channels 1/3/5
+      std::iota(oddRange.begin(), oddRange.end(), 24);
+      DenseMap<uint32_t, SmallVector<uint32_t>> channelToValidBdIds = {
+          {0, evenRange}, {1, oddRange}, {2, evenRange},
+          {3, oddRange}, {4, evenRange}, {5, oddRange}};
+      return channelToValidBdIds;
+    }
+    case AMDAIETileType::SHIMNOC: {
+      SmallVector<uint32_t> range(16);  // BD ids 0..15, shared by channels 0/1
+      std::iota(range.begin(), range.end(), 0);
+      DenseMap<uint32_t, SmallVector<uint32_t>> channelToValidBdIds = {
+          {0, range}, {1, range}};
+      return channelToValidBdIds;
+    }
+    default:
+      break;
+  }
+  llvm::report_fatal_error("Unhandled AMDAIETileType case");
+}
+
 struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) {
   switch (device) {
     case AMDAIEDevice::xcvc1902:
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
index b5a727685..37730c61e 100644
---
a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -294,6 +294,11 @@ struct AMDAIEDeviceModel { uint32_t getColumnShift() const; uint32_t getRowShift() const; + + /// Return a map from channels to valid BD ids for the requested tile type. + /// TODO(jornt): find these ranges in the device model. + DenseMap> getChannelToValidBdIds( + AMDAIETileType tileType) const; }; struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device); diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir index 2d6be1d88..7651e484b 100644 --- a/tests/samples/matmul_peeled_objectfifo.mlir +++ b/tests/samples/matmul_peeled_objectfifo.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s +// RUN: iree-opt 
--pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s // CHECK: aie.device(npu1_4col) // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) diff --git a/tests/samples/matmul_peeled_objectfifo_e2e.mlir b/tests/samples/matmul_peeled_objectfifo_e2e.mlir index 9cafe77f5..471c55262 100644 --- a/tests/samples/matmul_peeled_objectfifo_e2e.mlir +++ b/tests/samples/matmul_peeled_objectfifo_e2e.mlir @@ -12,11 +12,11 @@ // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) // CHECK-DAG: func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32(%[[ARG0:.+]]: memref<128x256xi32>, %[[ARG1:.+]]: memref<256x128xi32>, %[[ARG2:.+]]: memref<128x128xi32>) -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 64, 32][0, 0, 256, 1]) {id = 0 : i64, issue_token = true, metadata = @[[OBJ0:.+]]} +// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][2, 2, 64, 64][8192, 64, 128, 1]) {id = 0 : i64, issue_token = true, metadata = @[[OBJ10:.+]]} +// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 64, 32][0, 0, 256, 1]) {id = 1 : i64, issue_token = true, metadata = @[[OBJ0:.+]]} // CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ0]]} -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 2, 32, 32][0, 32, 128, 1]) {id = 0 : i64, issue_token = true, 
metadata = @[[OBJ1:.+]]} +// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 2, 32, 32][0, 32, 128, 1]) {id = 1 : i64, issue_token = true, metadata = @[[OBJ1:.+]]} // CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ1]]} -// CHECK-DAG: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][1, 1, 64, 64][0, 0, 128, 1]) {id = 0 : i64, issue_token = true, metadata = @[[OBJ10:.+]]} // CHECK-DAG: aiex.npu.dma_wait {symbol = @[[OBJ10]]} // CHECK-DAG: aie.shim_dma_allocation @[[OBJ0]](MM2S, 0, 0) // CHECK-DAG: aie.shim_dma_allocation @[[OBJ1]](MM2S, 1, 0)