diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp
new file mode 100644
index 000000000..6ba9acaec
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineLogicalObjFifosForConnectionReuse.cpp
@@ -0,0 +1,50 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-amdaie-combine-logical-objectfifos-for-connection-reuse"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+class AMDAIECombineLogicalObjFifosForConnectionReusePass
+    : public impl::AMDAIECombineLogicalObjFifosForConnectionReuseBase<
+          AMDAIECombineLogicalObjFifosForConnectionReusePass> {
+ public:
+  using AMDAIECombineLogicalObjFifosForConnectionReuseBase::
+      AMDAIECombineLogicalObjFifosForConnectionReuseBase;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIECombineLogicalObjFifosForConnectionReusePass::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+  IRRewriter rewriter(context);
+
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
+      fetchDmaCpyNdOpsToSplitOrCombine(moduleOp);
+
+  if (failed(combineLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) {
+    return signalPassFailure();
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass>
+createAMDAIECombineLogicalObjFifosForConnectionReusePass() {
+  return std::make_unique<AMDAIECombineLogicalObjFifosForConnectionReusePass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
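Note: the new pass is deliberately thin; it only gathers candidate L2->L1 DMA ops and hands them to combineLogicalObjectFifos from the splitting utilities. A minimal sketch of driving just this pass over a module, assuming the standard MLIR PassManager API (the context/moduleOp variables are illustrative, not part of the patch):

    mlir::PassManager pm(&context);
    pm.addPass(mlir::iree_compiler::AMDAIE::
                   createAMDAIECombineLogicalObjFifosForConnectionReusePass());
    if (failed(pm.run(moduleOp))) {
      // Combining bailed out; run with
      // -debug-only=iree-amdaie-logicalobjfifo-splitting-utils for the reason.
    }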
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
index 8a0e65e1d..0a77f17dc 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp
@@ -52,60 +52,6 @@ int64_t calculateNbIterations(int64_t lowerBound, int64_t upperBound,
 
 namespace {
 
-/// Utility affine expression visitor to retrieve the scale and optional bias
-/// from the expression.
-struct RetrieveScaleAndBias
-    : public AffineExprVisitor<RetrieveScaleAndBias, LogicalResult> {
-  std::optional<int64_t> scale;
-  std::optional<int64_t> bias;
-  LogicalResult visitAffineBinaryOpExpr(AffineBinaryOpExpr /*expr*/) {
-    return failure();
-  }
-  LogicalResult visitConstantExpr(AffineConstantExpr /*expr*/) {
-    return failure();
-  }
-  LogicalResult visitDimExpr(AffineDimExpr /*expr*/) { return failure(); }
-  LogicalResult visitSymbolExpr(AffineSymbolExpr /*expr*/) { return failure(); }
-  LogicalResult visitMulExpr(AffineBinaryOpExpr expr) {
-    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS());
-        isa<AffineDimExpr>(expr.getLHS())) {
-      scale = rhsSize.getValue();
-    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS());
-               isa<AffineDimExpr>(expr.getRHS())) {
-      scale = lhsSize.getValue();
-    }
-    return success();
-  }
-  LogicalResult visitAddExpr(AffineBinaryOpExpr expr) {
-    if (bias) return failure();
-    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS())) {
-      bias = rhsSize.getValue();
-      if (bias.value() < 0) return failure();
-      if (isa<AffineBinaryOpExpr>(expr.getLHS())) {
-        return visit(expr.getLHS());
-      } else if (isa<AffineDimExpr>(expr.getLHS())) {
-        scale = 1;
-        return success();
-      } else {
-        return failure();
-      }
-    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS())) {
-      bias = lhsSize.getValue();
-      if (bias.value() < 0) return failure();
-      if (isa<AffineBinaryOpExpr>(expr.getRHS())) {
-        return visit(expr.getRHS());
-      } else if (isa<AffineDimExpr>(expr.getRHS())) {
-        scale = 1;
-        return success();
-      } else {
-        return failure();
-      }
-    } else {
-      return failure();
-    }
-  }
-};
-
 struct SubsumeLoopIntoDMA
     : public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
   using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
index e628cc739..81a0d6994 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h
@@ -12,12 +12,71 @@
 #include "iree-amd-aie/IR/AMDAIEOps.h"
 #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
 #include "llvm/ADT/SmallVector.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
 
 namespace mlir::iree_compiler::AMDAIE {
 
+/// Utility to retrieve a constant index from an OpFoldResult.
+int64_t getConstantIndexOrAssert(OpFoldResult dim);
+
+/// Utility affine expression visitor to retrieve the scale and optional bias
+/// from the expression.
+struct RetrieveScaleAndBias
+    : public AffineExprVisitor<RetrieveScaleAndBias, LogicalResult> {
+  std::optional<int64_t> scale;
+  std::optional<int64_t> bias;
+  LogicalResult visitAffineBinaryOpExpr(AffineBinaryOpExpr /*expr*/) {
+    return failure();
+  }
+  LogicalResult visitConstantExpr(AffineConstantExpr /*expr*/) {
+    return failure();
+  }
+  LogicalResult visitDimExpr(AffineDimExpr /*expr*/) { return failure(); }
+  LogicalResult visitSymbolExpr(AffineSymbolExpr /*expr*/) { return failure(); }
+  LogicalResult visitMulExpr(AffineBinaryOpExpr expr) {
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS());
+        isa<AffineDimExpr>(expr.getLHS())) {
+      scale = rhsSize.getValue();
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS());
+               isa<AffineDimExpr>(expr.getRHS())) {
+      scale = lhsSize.getValue();
+    }
+    return success();
+  }
+  LogicalResult visitAddExpr(AffineBinaryOpExpr expr) {
+    if (bias) return failure();
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS())) {
+      bias = rhsSize.getValue();
+      if (bias.value() < 0) return failure();
+      if (isa<AffineBinaryOpExpr>(expr.getLHS())) {
+        return visit(expr.getLHS());
+      } else if (isa<AffineDimExpr>(expr.getLHS())) {
+        scale = 1;
+        return success();
+      } else {
+        return failure();
+      }
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS())) {
+      bias = lhsSize.getValue();
+      if (bias.value() < 0) return failure();
+      if (isa<AffineBinaryOpExpr>(expr.getRHS())) {
+        return visit(expr.getRHS());
+      } else if (isa<AffineDimExpr>(expr.getRHS())) {
+        scale = 1;
+        return success();
+      } else {
+        return failure();
+      }
+    } else {
+      return failure();
+    }
+  }
+};
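Note: moving RetrieveScaleAndBias into AMDAIEDmaUtils.h makes it reusable by the new combine utilities below. As a worked example of what it computes, consider the expression from the lit test's affine_map<(d0) -> (d0 * 64 + 32)> (a sketch; the ctx variable is illustrative):

    MLIRContext *ctx = ...;
    AffineExpr expr = getAffineDimExpr(0, ctx) * 64 + 32;  // d0 * 64 + 32
    RetrieveScaleAndBias retriever;
    if (succeeded(retriever.visit(expr))) {
      // retriever.scale == 64, retriever.bias == 32.
    }
    // A bare dim/symbol/constant, or a negative bias, makes visit() fail.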
+
 // Constant specifying the number of inter-iteration dimension for DMA
 // operations.
 //
@@ -194,9 +253,9 @@ struct DmaDimConfig {
   AMDAIE::AMDAIETileType sourceTileType;
   AMDAIE::AMDAIETileType targetTileType;
   /// The maximum number of addressing dimensions on the source side of the DMA.
-  uint8_t sourceMaxNbDims{0};
+  int64_t sourceMaxNbDims{0};
   /// The maximum number of addressing dimensions on the target side of the DMA.
-  uint8_t targetMaxNbDims{0};
+  int64_t targetMaxNbDims{0};
 
   DmaDimConfig(const AMDAIE::AMDAIEDeviceModel &deviceModel,
                uint8_t sourceMemspaceInt, uint8_t targetMemspaceInt)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
index 6b2fda49e..2ebbabda6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -8,6 +8,7 @@
 
 #include <numeric>
 
+#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -17,10 +18,66 @@
 #include "mlir/IR/Iterators.h"
 #include "mlir/IR/Operation.h"
 
+///////////////////////////////////////////////////
+#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
+#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
+///////////////////////////////////////////////////
+
 #define DEBUG_TYPE "iree-amdaie-logicalobjfifo-splitting-utils"
 
 namespace mlir::iree_compiler::AMDAIE {
 
+/// Utility to create a new logical objectfifo based on the shape defined by
+/// `newSizesOpFoldResultArr`.
+static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo(
+    IRRewriter &rewriter,
+    AMDAIE::LogicalObjectFifoFromMemrefOp &oldLogicalObjectFifo,
+    SmallVector<OpFoldResult> &newSizesOpFoldResultArr) {
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<int64_t> newSizes;
+  for (OpFoldResult sizeVal : newSizesOpFoldResultArr) {
+    newSizes.push_back(getConstantIndexOrAssert(sizeVal));
+  }
+  Value oldAllocOp = oldLogicalObjectFifo.getMemref();
+  auto oldMemRefType = cast<MemRefType>(oldAllocOp.getType());
+  MemRefType newAllocType = MemRefType::get(
+      newSizes, oldMemRefType.getElementType(), MemRefLayoutAttrInterface{},
+      oldMemRefType.getMemorySpace());
+  assert(oldAllocOp.getDefiningOp() && "expected a defining op for the value");
+  rewriter.setInsertionPoint(oldAllocOp.getDefiningOp());
+  auto newAllocOp =
+      rewriter.create<memref::AllocOp>(rewriter.getUnknownLoc(), newAllocType);
+  auto newDeallocOp =
+      rewriter.create<memref::DeallocOp>(rewriter.getUnknownLoc(), newAllocOp);
+  newDeallocOp->moveBefore(&newAllocOp->getBlock()->back());
+  auto type = cast<MemRefType>(newAllocOp.getType());
+  // Create the new logical objectfifo.
+  rewriter.setInsertionPoint(oldLogicalObjectFifo);
+  auto newLogicalObjectFifo =
+      rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+          rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type),
+          newAllocOp.getResult(), oldLogicalObjectFifo.getTiles());
+  return newLogicalObjectFifo;
+}
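Note: a small usage sketch of the helper above, shrinking/replacing an L2 objectfifo with the combined 1x2x32x32 shape the pass produces in the lit test (the oldFifo variable is hypothetical; sizes are built with MLIR's getAsIndexOpFoldResult):

    SmallVector<OpFoldResult> newSizes =
        getAsIndexOpFoldResult(ctx, ArrayRef<int64_t>{1, 2, 32, 32});
    AMDAIE::LogicalObjectFifoFromMemrefOp newFifo =
        createNewLogicalObjectFifo(rewriter, oldFifo, newSizes);
    // newFifo wraps a fresh memref.alloc of the new shape on the same tiles
    // as oldFifo; a matching memref.dealloc is created as well.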
+
+/// Utility to help fetch those input DmaCpyNd ops which need to be split or
+/// combined.
+SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplitOrCombine(
+    ModuleOp moduleOp) {
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
+  // We are currently walking through CoreOps, gathering the 3rd input DmaOp
+  // (if applicable) from them.
+  // TODO(avarma): We will generalize this later.
+  moduleOp.walk([&](AMDAIE::CoreOp coreOp) {
+    SmallVector<Value> inputDmas = coreOp.getInputDmas();
+    if (inputDmas.size() != 3) return WalkResult::skip();
+    auto dmaCpyNdOp = inputDmas[2].getDefiningOp<AMDAIE::DmaCpyNdOp>();
+    assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op");
+    l2ToL1DmaOps.push_back(dmaCpyNdOp);
+    return WalkResult::advance();
+  });
+  return l2ToL1DmaOps;
+}
+
 /// Utility to verify that the split dimensions for L2 are contiguous.
 static LogicalResult checkIsRangeFromZero(
     SmallVector<size_t> &splitDimsSetForL2) {
@@ -124,6 +181,44 @@ static FailureOr<OpFoldResult> updateL3SourceOffset(IRRewriter &rewriter,
   return newL3AsSourceOffset;
 }
 
+/// Given an L2->L1 DmaCpyNd op, find the unique L3->L2 DmaCpyNd op.
+static FailureOr<AMDAIE::DmaCpyNdOp> fetchL3ToL2DmaCpyNdOp(
+    AMDAIE::DmaCpyNdOp l2ToL1DmaOp) {
+  LogicalObjectFifoFromMemrefOp sourceObjectFifo =
+      l2ToL1DmaOp.getSourceObjectFifo();
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp;
+  for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) {
+    if (auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(objFifoUserOp);
+        dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo) {
+      l3ToL2DmaOps.push_back(dmaOp);
+    }
+  }
+  if (l3ToL2DmaOps.size() == 0) {
+    LLVM_DEBUG(llvm::dbgs() << "no corresponding L3->L2 dma op found for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  if (l3ToL2DmaOps.size() > 1) {
+    LLVM_DEBUG(llvm::dbgs() << "found more than one L3->L2 dma op for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  l3ToL2DmaOp = l3ToL2DmaOps[0];
+  if ((l3ToL2DmaOp.getTargetMixedOffsets().size() !=
+       l3ToL2DmaOp.getSourceMixedOffsets().size()) ||
+      (l3ToL2DmaOp.getTargetMixedSizes().size() !=
+       l3ToL2DmaOp.getSourceMixedSizes().size()) ||
+      (l3ToL2DmaOp.getTargetMixedStrides().size() !=
+       l3ToL2DmaOp.getSourceMixedStrides().size())) {
+    LLVM_DEBUG(llvm::dbgs() << "dimensionality of source and target "
+                               "offsets/sizes/strides found different for "
+                            << l3ToL2DmaOp << "\n");
+    return failure();
+  }
+  return l3ToL2DmaOp;
+}
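Note: the uniqueness requirement above amounts to there being exactly one DMA producer writing the shared L2 objectfifo; with more than one producer, the combining below would be ambiguous. Counting producers explicitly would look like this (a sketch, not part of the patch):

    auto isProducer = [&](Operation *user) {
      auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(user);
      return dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo;
    };
    unsigned nbProducers =
        llvm::count_if(sourceObjectFifo->getUsers(), isProducer);
    // fetchL3ToL2DmaCpyNdOp succeeds only when nbProducers == 1.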
+
 /// A struct utility to encapsulate all the data required to perform splitting
 /// of logicalobjectfifos.
 struct SplittingLogicalObjectFifoData {
@@ -186,36 +281,10 @@ static LogicalResult checkWhetherSplitIsPossible(
   }
 
   // Fetch the L3 -> L2 Dma Op corresponding to the L2 buffer as target.
-  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
-  AMDAIE::DmaCpyNdOp l3ToL2DmaOp;
-  for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) {
-    if (auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(objFifoUserOp);
-        dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo) {
-      l3ToL2DmaOps.push_back(dmaOp);
-    }
-  }
-  if (l3ToL2DmaOps.size() == 0) {
-    LLVM_DEBUG(llvm::dbgs() << "no corresponding L3->L2 dma op found for "
-                            << sourceObjectFifo << "\n");
-    return failure();
-  }
-  if (l3ToL2DmaOps.size() > 1) {
-    LLVM_DEBUG(llvm::dbgs() << "found more than one L3->L2 dma op for "
-                            << sourceObjectFifo << "\n");
-    return failure();
-  }
-  l3ToL2DmaOp = l3ToL2DmaOps[0];
-  if ((l3ToL2DmaOp.getTargetMixedOffsets().size() !=
-       l3ToL2DmaOp.getSourceMixedOffsets().size()) ||
-      (l3ToL2DmaOp.getTargetMixedSizes().size() !=
-       l3ToL2DmaOp.getSourceMixedSizes().size()) ||
-      (l3ToL2DmaOp.getTargetMixedStrides().size() !=
-       l3ToL2DmaOp.getSourceMixedStrides().size())) {
-    LLVM_DEBUG(llvm::dbgs() << "dimensionality of source and target "
-                               "offsets/sizes/strides found different for "
-                            << l3ToL2DmaOp << "\n");
-    return failure();
-  }
+  FailureOr<AMDAIE::DmaCpyNdOp> maybeL3ToL2DmaOp =
+      fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[0]);
+  if (failed(maybeL3ToL2DmaOp)) return failure();
+  AMDAIE::DmaCpyNdOp l3ToL2DmaOp = maybeL3ToL2DmaOp.value();
 
   SmallVector<OpFoldResult> staticL2AsTargetSizes =
       l3ToL2DmaOp.getTargetMixedSizes();
@@ -289,16 +358,13 @@ LogicalResult splitLogicalObjectFifos(
   toBeErased.insert(sourceAllocOp);
   toBeErased.insert(sourceObjectFifo);
 
-  SmallVector<OpFoldResult> staticL2AsTargetOffsets =
+  SmallVector<OpFoldResult> staticL2AsTargetOffsets =
       l3ToL2DmaOp.getTargetMixedOffsets();
-  SmallVector<OpFoldResult> staticL2AsTargetSizes =
+  SmallVector<OpFoldResult> staticL2AsTargetSizes =
      l3ToL2DmaOp.getTargetMixedSizes();
-  SmallVector<int64_t> l2ShapeAsTarget = llvm::to_vector(
-      cast<MemRefType>(l3ToL2DmaOp.getTargetObjectFifo().getMemref().getType())
-          .getShape());
-  SmallVector<OpFoldResult> staticL3AsSourceOffsets =
+  SmallVector<OpFoldResult> staticL3AsSourceOffsets =
      l3ToL2DmaOp.getSourceMixedOffsets();
-  SmallVector<OpFoldResult> staticL3AsSourceSizes =
+  SmallVector<OpFoldResult> staticL3AsSourceSizes =
      l3ToL2DmaOp.getSourceMixedSizes();
   OpFoldResult zeroVal = getAsIndexOpFoldResult(context, 0);
   OpFoldResult oneVal = getAsIndexOpFoldResult(context, 1);
@@ -310,45 +376,28 @@ LogicalResult splitLogicalObjectFifos(
     staticL2AsTargetSizes[dim] = oneVal;
     staticL3AsSourceOffsets[dim] = zeroVal;
     staticL3AsSourceSizes[dim] = oneVal;
-    l2ShapeAsTarget[dim] = 1;
   }
 
   // Traverse each L2->L1 DmaCpyNd op and split them.
   for (AMDAIE::DmaCpyNdOp l2ToL1DmaOp : l2ToL1DmaOps) {
-    SmallVector<OpFoldResult> staticL2AsSourceOffsets =
+    SmallVector<OpFoldResult> staticL2AsSourceOffsets =
         l2ToL1DmaOp.getSourceMixedOffsets();
-    SmallVector<OpFoldResult> staticL2AsSourceSizes =
+    SmallVector<OpFoldResult> staticL2AsSourceSizes =
        l2ToL1DmaOp.getSourceMixedSizes();
 
     // Now we'll create a new L2 buffer based on the new shape inferred earlier
-    // via `l2ShapeAsTarget`.
-    rewriter.setInsertionPoint(sourceAllocOp);
-    LogicalObjectFifoFromMemrefOp targetObjectFifo =
-        l2ToL1DmaOp.getTargetObjectFifo();
-    Value targetAllocOp = targetObjectFifo.getMemref();
-    auto oldSourceMemRefType = cast<MemRefType>(sourceAllocOp.getType());
-    auto targetMemRefType = cast<MemRefType>(targetAllocOp.getType());
-    MemRefType newAllocType = MemRefType::get(
-        l2ShapeAsTarget, targetMemRefType.getElementType(),
-        MemRefLayoutAttrInterface{}, oldSourceMemRefType.getMemorySpace());
-    auto newAllocOp = rewriter.create<memref::AllocOp>(rewriter.getUnknownLoc(),
-                                                       newAllocType);
-    auto newDeallocOp = rewriter.create<memref::DeallocOp>(
-        rewriter.getUnknownLoc(), newAllocOp);
-    newDeallocOp->moveBefore(&newAllocOp->getBlock()->back());
-    auto type = cast<MemRefType>(newAllocOp.getType());
-    // Create new logicalobjectfifo.from_memref for the newly created L2 buffer.
-    rewriter.setInsertionPoint(l2ToL1DmaOp.getSourceObjectFifo());
-    auto source = rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-        rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type),
-        newAllocOp.getResult(), sourceObjectFifo.getTiles());
+    // via `staticL2AsTargetSizes`.
+    LogicalObjectFifoFromMemrefOp oldL2ObjectFifo =
+        l2ToL1DmaOp.getSourceObjectFifo();
+    AMDAIE::LogicalObjectFifoFromMemrefOp source = createNewLogicalObjectFifo(
+        rewriter, oldL2ObjectFifo, staticL2AsTargetSizes);
 
     // --------------------------------------------
     // ---------- L3 -> L2 splitting --------------
     // --------------------------------------------
    // Update L3 source offsets for non-split dimensions. Refer to the doc
    // comment of `updateL3SourceOffset` for the computation rationale involved.
-    SmallVector<OpFoldResult> staticL3AsSourceOffsets =
+    SmallVector<OpFoldResult> staticL3AsSourceOffsets =
         l3ToL2DmaOp.getSourceMixedOffsets();
     for (auto &&[splitDim, nonSplitdim] :
          llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) {
@@ -427,4 +476,386 @@ LogicalResult splitLogicalObjectFifos(
   return success();
 }
 
+static int64_t fetchOffsetBias(OpFoldResult offsetOpFoldResult) {
+  std::optional<int64_t> offset = getConstantIntValue(offsetOpFoldResult);
+  if (offset) return offset.value();
+  auto offsetVal = cast<Value>(offsetOpFoldResult);
+  auto affineApplyOp =
+      dyn_cast_if_present<affine::AffineApplyOp>(offsetVal.getDefiningOp());
+  if (!affineApplyOp) return 0;
+  AffineMap affineMap = affineApplyOp.getAffineMap();
+  RetrieveScaleAndBias retriever;
+  assert(!failed(retriever.visit(affineMap.getResult(0))) &&
+         "failed to retrieve scale and bias");
+  int64_t bias = 0;
+  if (retriever.bias) {
+    bias = retriever.bias.value();
+  }
+  return bias;
+}
+
+static LogicalResult combineL3ToL2AccessPatterns(
+    RewriterBase &rewriter, const SmallVector<OpFoldResult> &offsetsA,
+    const SmallVector<OpFoldResult> &sizesA,
+    const SmallVector<OpFoldResult> &stridesA,
+    const SmallVector<OpFoldResult> &offsetsB,
+    const SmallVector<OpFoldResult> &sizesB,
+    const SmallVector<OpFoldResult> &stridesB,
+    SmallVector<OpFoldResult> &newOffsets, SmallVector<OpFoldResult> &newSizes,
+    SmallVector<OpFoldResult> &newStrides, SmallVector<size_t> &splitDims,
+    SmallVector<size_t> &nonSplitDims) {
+  if (offsetsA.empty() && offsetsB.empty()) return success();
+
+  int64_t newSize = 1;
+  for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) {
+    if (iter.index() < splitDims.size()) continue;
+    const OpFoldResult &offsetA = std::get<0>(iter.value());
+    const OpFoldResult &offsetB = std::get<1>(iter.value());
+    if (offsetA != offsetB) {
+      // Need to check the difference in bias here.
+      int64_t biasA = fetchOffsetBias(offsetA);
+      int64_t biasB = fetchOffsetBias(offsetB);
+      std::optional<int64_t> sizeA = getConstantIntValue(sizesA[iter.index()]);
+      assert(sizeA && "expected a constant integer value for size");
+      assert((sizeA == biasB - biasA) &&
+             "L3->L2 pair cannot be combined because offset is not contiguous");
+      newSize++;
+    }
+  }
+  newSizes[splitDims.size() - 1] = rewriter.getI64IntegerAttr(newSize);
+  return success();
+}
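Note: concretely, in the lit test below the two L3 offsets on the combined dimension come from affine_map<(d0) -> (d0 * 64)> and affine_map<(d0) -> (d0 * 64 + 32)>, so fetchOffsetBias returns 0 and 32, and the constant size on that dimension is 32. The same arithmetic as a standalone sketch:

    int64_t biasA = 0, biasB = 32;  // from the two affine.apply offsets
    int64_t sizeA = 32;             // constant size on that dimension
    bool contiguous = (sizeA == biasB - biasA);  // B starts where A ends
    int64_t newSize = contiguous ? 2 : 1;  // one combined window of size 2

The combined access keeps A's offsets and strides and only bumps the size on the last split dimension, which is how [1, 1, 32, 32] becomes [1, 2, 32, 32] in the test.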
+
+static FailureOr<LogicalObjectFifoFromMemrefOp> combineL3ToL2Pair(
+    IRRewriter &rewriter, DmaCpyNdOp dmaOpA, DmaCpyNdOp dmaOpB,
+    SmallVector<size_t> &splitDims, SmallVector<size_t> &nonSplitDims) {
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<OpFoldResult> sourceOffsetsA = dmaOpA.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = dmaOpA.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesA = dmaOpA.getSourceMixedStrides();
+  SmallVector<OpFoldResult> sourceOffsetsB = dmaOpB.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = dmaOpB.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesB = dmaOpB.getSourceMixedStrides();
+
+  SmallVector<OpFoldResult> targetOffsetsA = dmaOpA.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesA = dmaOpA.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesA = dmaOpA.getTargetMixedStrides();
+  SmallVector<OpFoldResult> targetOffsetsB = dmaOpB.getTargetMixedOffsets();
+  SmallVector<OpFoldResult> targetSizesB = dmaOpB.getTargetMixedSizes();
+  SmallVector<OpFoldResult> targetStridesB = dmaOpB.getTargetMixedStrides();
+
+  SmallVector<OpFoldResult> newSourceOffsets = sourceOffsetsA;
+  SmallVector<OpFoldResult> newSourceSizes = sourceSizesA;
+  SmallVector<OpFoldResult> newSourceStrides = sourceStridesA;
+  if (failed(combineL3ToL2AccessPatterns(
+          rewriter, sourceOffsetsA, sourceSizesA, sourceStridesA,
+          sourceOffsetsB, sourceSizesB, sourceStridesB, newSourceOffsets,
+          newSourceSizes, newSourceStrides, splitDims, nonSplitDims))) {
+    return failure();
+  }
+
+  SmallVector<OpFoldResult> newTargetOffsets = targetOffsetsA;
+  SmallVector<OpFoldResult> newTargetSizes = newSourceSizes;
+  SmallVector<OpFoldResult> newTargetStrides = targetStridesA;
+  // Now we need to create a new L2 buffer based on `newTargetSizes`.
+  LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = dmaOpA.getTargetObjectFifo();
+  AMDAIE::LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
+      createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, newTargetSizes);
+
+  // Create the combined L3->L2 Dma.
+  rewriter.setInsertionPoint(dmaOpA);
+  auto combinedL3ToL2DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+      dmaOpA.getLoc(), newL2ObjectFifo, llvm::ArrayRef(newTargetOffsets),
+      llvm::ArrayRef(newTargetSizes), llvm::ArrayRef(newTargetStrides),
+      dmaOpA.getSource(), llvm::ArrayRef(newSourceOffsets),
+      llvm::ArrayRef(newSourceSizes), llvm::ArrayRef(newSourceStrides));
+  // Replace the uses of the 2nd L3->L2 Dma with the new combined L3->L2 Dma
+  // and erase the 1st L3->L2 Dma.
+  rewriter.replaceOp(dmaOpB, combinedL3ToL2DmaOp);
+  rewriter.eraseOp(dmaOpA);
+  return newL2ObjectFifo;
+}
+
+/// Utility to fetch the unique CoreOp associated with an L2->L1 Dma op.
+static CoreOp fetchUniqueCoreOp(DmaCpyNdOp &l2ToL1DmaOp) {
+  SmallVector<CoreOp> coreOps;
+  for (Operation *userOp : l2ToL1DmaOp->getUsers()) {
+    if (auto coreOp = dyn_cast<CoreOp>(userOp)) {
+      coreOps.push_back(coreOp);
+    }
+  }
+  assert(coreOps.size() == 1 &&
+         "L2->L1 Dma op expected to have a unique Core op");
+  return coreOps[0];
+}
+
+static bool compareL3ToL2DmaPair(DmaCpyNdOp &a, DmaCpyNdOp &b) {
+  SmallVector<OpFoldResult> sourceOffsetsA = a.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = a.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceOffsetsB = b.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = b.getSourceMixedSizes();
+  // We'll add assertion checks on the size before invoking this function.
+  for (int64_t i = 0, n = sourceOffsetsA.size(); i < n; i++) {
+    std::optional<int64_t> offsetA = getConstantIntValue(sourceOffsetsA[i]);
+    std::optional<int64_t> offsetB = getConstantIntValue(sourceOffsetsB[i]);
+    if (offsetA && offsetB) {
+      if (offsetA < offsetB) return true;
+      if (offsetA > offsetB) return false;
+      continue;
+    }
+    if (!offsetA && !offsetB) {
+      auto offsetValA = cast<Value>(sourceOffsetsA[i]);
+      auto offsetValB = cast<Value>(sourceOffsetsB[i]);
+      auto affineApplyOpA = dyn_cast_if_present<affine::AffineApplyOp>(
+          offsetValA.getDefiningOp());
+      auto affineApplyOpB = dyn_cast_if_present<affine::AffineApplyOp>(
+          offsetValB.getDefiningOp());
+      // TODO(avarma): This should be handled better. The overall possibility
+      // here already makes this complex enough.
+      assert(affineApplyOpA && "expected affine.apply op");
+      assert(affineApplyOpB && "expected affine.apply op");
+      for (auto &&[valA, valB] :
+           llvm::zip_equal(affineApplyOpA.getMapOperands(),
+                           affineApplyOpB.getMapOperands())) {
+        assert((valA == valB) &&
+               "different base values being operated on between the L3->L2 "
+               "Dma op pair");
+      }
+      AffineMap affineMapA = affineApplyOpA.getAffineMap();
+      AffineMap affineMapB = affineApplyOpB.getAffineMap();
+      RetrieveScaleAndBias retrieverA, retrieverB;
+      assert(!failed(retrieverA.visit(affineMapA.getResult(0))) &&
+             "failed to retrieve scale and bias");
+      assert(!failed(retrieverB.visit(affineMapB.getResult(0))) &&
+             "failed to retrieve scale and bias");
+      int64_t biasA = 0, biasB = 0;
+      if (retrieverA.bias) {
+        biasA = retrieverA.bias.value();
+      }
+      if (retrieverB.bias) {
+        biasB = retrieverB.bias.value();
+      }
+      // TODO(avarma): We should also check the scale value as well.
+      if (biasA < biasB) return true;
+      if (biasA > biasB) return false;
+      continue;
+    }
+    assert(false &&
+           "unexpected combination of offset val amongst L3->L2 Dma pair");
+  }
+  return false;
+}
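Note: compareL3ToL2DmaPair is a less-than comparator over source offsets (constant offsets first, then the affine.apply biases). The insertion sort later in combineLogicalObjectFifos keeps l3ToL2DmaOps and l2ToL1DmaOps in lockstep; an equivalent formulation via an index permutation would be (a sketch, assuming <algorithm> and <numeric>):

    SmallVector<unsigned> order(l3ToL2DmaOps.size());
    std::iota(order.begin(), order.end(), 0);
    std::stable_sort(order.begin(), order.end(), [&](unsigned x, unsigned y) {
      return compareL3ToL2DmaPair(l3ToL2DmaOps[x], l3ToL2DmaOps[y]);
    });
    // Then apply `order` to both op vectors so adjacent entries pair up.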
+
+static LogicalResult checkIfSameDimensionalityAccessPatterns(
+    AMDAIE::DmaCpyNdOp &l3ToL2DmaOpA, AMDAIE::DmaCpyNdOp &l3ToL2DmaOpB) {
+  SmallVector<OpFoldResult> sourceOffsetsA =
+      l3ToL2DmaOpA.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesA = l3ToL2DmaOpA.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesA =
+      l3ToL2DmaOpA.getSourceMixedStrides();
+  SmallVector<OpFoldResult> sourceOffsetsB =
+      l3ToL2DmaOpB.getSourceMixedOffsets();
+  SmallVector<OpFoldResult> sourceSizesB = l3ToL2DmaOpB.getSourceMixedSizes();
+  SmallVector<OpFoldResult> sourceStridesB =
+      l3ToL2DmaOpB.getSourceMixedStrides();
+  if (sourceOffsetsA.size() != sourceOffsetsB.size() ||
+      sourceSizesA.size() != sourceSizesB.size() ||
+      sourceStridesA.size() != sourceStridesB.size() ||
+      sourceOffsetsA.size() != sourceSizesA.size() ||
+      sourceOffsetsA.size() != sourceStridesB.size()) {
+    return failure();
+  }
+  return success();
+}
+
+/// Given a vector of L2->L1 Dma ops, combine the corresponding L3->L2 Dma ops
+/// and reuse the L2/L1 buffers.
+/// TODO(avarma): Assign combined tiles while forming the L2/L1 buffers which
+/// we'll reuse.
+LogicalResult combineLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context) {
+  if (l2ToL1DmaOps.size() == 0) return success();
+
+  // Fetch the L3 -> L2 Dma op corresponding to the first L2 buffer as target.
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  FailureOr<AMDAIE::DmaCpyNdOp> maybeL3ToL2DmaOp =
+      fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[0]);
+  if (failed(maybeL3ToL2DmaOp)) return failure();
+  l3ToL2DmaOps.push_back(maybeL3ToL2DmaOp.value());
+
+  // Check that the L3 buffers associated with the different L3->L2 Dma ops
+  // are all the same.
+  for (unsigned i = 1, n = l2ToL1DmaOps.size(); i < n; i++) {
+    maybeL3ToL2DmaOp = fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[i]);
+    if (failed(maybeL3ToL2DmaOp)) return failure();
+    l3ToL2DmaOps.push_back(maybeL3ToL2DmaOp.value());
+    if (l3ToL2DmaOps[0].getSourceObjectFifo() !=
+        l3ToL2DmaOps[i].getSourceObjectFifo()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Found different L3 objectFifo for " << l3ToL2DmaOps[0]
+                 << " and " << l3ToL2DmaOps[i] << "\n");
+      return failure();
+    }
+    if (failed(checkIfSameDimensionalityAccessPatterns(l3ToL2DmaOps[0],
+                                                       l3ToL2DmaOps[i]))) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Found different dimensionality of access patterns\n");
+      return failure();
+    }
+  }
+
+  if (l2ToL1DmaOps.size() != l3ToL2DmaOps.size()) {
+    LLVM_DEBUG(
+        llvm::dbgs()
+        << "expected 1:1 correspondence between L3->L2 and L2->L1 Dma ops\n");
+    return failure();
+  }
+
+  // Fetch split/non-split dimensions. Currently we look for a continuous
+  // sequence of 0 offset dims with size as 1 to infer them as split
+  // dimensions.
+  DenseSet<size_t> splitDimsSetForL2;
+  SmallVector<size_t> splitDimsForL2;
+  size_t maxSplitDimIndex = 0;
+  for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i++) {
+    SmallVector<OpFoldResult> sourceOffsets =
+        l3ToL2DmaOps[i].getSourceMixedOffsets();
+    SmallVector<OpFoldResult> sourceSizes =
+        l3ToL2DmaOps[i].getSourceMixedSizes();
+    unsigned j = 0, m = sourceOffsets.size();
+    // Traverse through the i-th L3->L2 Dma op's source offset/size to find a
+    // continuous sequence of 0 offset dims with size as 1.
+    while (j < m) {
+      std::optional<int64_t> constantOffset =
+          getConstantIntValue(sourceOffsets[j]);
+      if (!constantOffset || constantOffset.value() != 0) {
+        break;
+      }
+      std::optional<int64_t> constantSize = getConstantIntValue(sourceSizes[j]);
+      if (!constantSize || constantSize.value() != 1) {
+        break;
+      }
+      j++;
+    }
+    if (i == 0) {
+      maxSplitDimIndex = j;
+    } else if (maxSplitDimIndex != j) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "incompatible split dimensions across L3->L2\n");
+      return failure();
+    }
+  }
+  SmallVector<size_t> splitDims(maxSplitDimIndex);
+  std::iota(splitDims.begin(), splitDims.end(), 0);
+  SmallVector<size_t> nonSplitDims(maxSplitDimIndex);
+  std::iota(nonSplitDims.begin(), nonSplitDims.end(), splitDims.size());
+
+  // Sort the L3->L2 Dma ops based on the "overlapping" offsets, and sort the
+  // corresponding L2->L1 Dma ops accordingly.
+  for (int64_t i = 1, n = l3ToL2DmaOps.size(); i < n; i++) {
+    DmaCpyNdOp currL3ToL2DmaOp = l3ToL2DmaOps[i];
+    DmaCpyNdOp currL2ToL1DmaOp = l2ToL1DmaOps[i];
+    int64_t j = i - 1;
+    while (j >= 0 && compareL3ToL2DmaPair(currL3ToL2DmaOp, l3ToL2DmaOps[j])) {
+      l3ToL2DmaOps[j + 1] = l3ToL2DmaOps[j];
+      l2ToL1DmaOps[j + 1] = l2ToL1DmaOps[j];
+      j--;
+    }
+    l3ToL2DmaOps[j + 1] = currL3ToL2DmaOp;
+    l2ToL1DmaOps[j + 1] = currL2ToL1DmaOp;
+  }
+
+  // Currently we have 4 cores, so there are two pairs of DmaCpyNds to combine.
+  // TODO(avarma): Revisit this later when we want to target more cores.
+  if (l3ToL2DmaOps.size() % 2 != 0) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "found an odd number of L3->L2 Dma ops for combining\n");
+    return failure();
+  }
+
+  auto createL2ToL1ForReuse =
+      [](IRRewriter &rewriter, DmaCpyNdOp &l2ToL1DmaOp,
+         LogicalObjectFifoFromMemrefOp &reuseL1Buffer,
+         LogicalObjectFifoFromMemrefOp &reuseL2Buffer,
+         SmallVector<OpFoldResult> &newL2SourceOffsets) -> DmaCpyNdOp {
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(l2ToL1DmaOp);
+    auto newL2ToL1DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l2ToL1DmaOp.getLoc(), reuseL1Buffer,
+        l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(),
+        l2ToL1DmaOp.getTargetMixedStrides(), reuseL2Buffer,
+        llvm::ArrayRef(newL2SourceOffsets), l2ToL1DmaOp.getSourceMixedSizes(),
+        l2ToL1DmaOp.getSourceMixedStrides());
+    rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp);
+    return newL2ToL1DmaOp;
+  };
+  for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) {
+    // Step 1. Combine the picked L3->L2 DmaCpyNd pair.
+    FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
+        combineL3ToL2Pair(rewriter, l3ToL2DmaOps[i], l3ToL2DmaOps[i + 1],
+                          splitDims, nonSplitDims);
+    if (failed(maybeNewL2ObjectFifo)) return failure();
+    LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
+        maybeNewL2ObjectFifo.value();
+
+    // Step 2. We now need to create two L2->L1 ops since the size has
+    // changed. But for this we first need to find the new offset for L2 as
+    // source.
+    // TODO: For now I'm hardcoding the offsets, but later it'd just depend
+    // on split/non-split dimensions.
+    // Offset = 0, 0.
+    LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
+        l2ToL1DmaOps[i].getTargetObjectFifo();
+    SmallVector<OpFoldResult> newL2AsSourceOffsets =
+        l2ToL1DmaOps[i].getSourceMixedOffsets();
+    DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse(
+        rewriter, l2ToL1DmaOps[i], reuseL1LogicalObjectFifoOp, newL2ObjectFifo,
+        newL2AsSourceOffsets);
+    // Offset = 0, 1. NOTE: here we use the same L1 logical objectFifo as
+    // the first L2->L1 Dma.
+    newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets();
+    newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1);
+    DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse(
+        rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp,
+        newL2ObjectFifo, newL2AsSourceOffsets);
+
+    // Step 3. Pick the CoreOps associated with the 1:1 L2->L1 Dma ops.
+    // For the first Core op we'll insert the Read at the end. It doesn't
+    // matter for now, so we're going to insert it right before the amdaie.end
+    // op.
+    CoreOp firstCoreOp = fetchUniqueCoreOp(newFirstL2ToL1DmaOp);
+    firstCoreOp.walk([&](AMDAIE::EndOp endOp) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      // Hardcoded to `AMDAIE::MemoryAccess::Read`.
+      rewriter.setInsertionPoint(endOp);
+      rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+          rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+          AMDAIE::MemoryAccess::Read);
+    });
+    // For the second Core op we'll insert the `Read` right before the first
+    // read from the corresponding L1 logicalobjectFifo.
+    CoreOp secondCoreOp = fetchUniqueCoreOp(newSecondL2ToL1DmaOp);
+    secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
+      if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) {
+        OpBuilder::InsertionGuard guard(rewriter);
+        // Hardcoded to `AMDAIE::MemoryAccess::Read`.
+        rewriter.setInsertionPoint(accessOp);
+        rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+            rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+            AMDAIE::MemoryAccess::Read);
+        // We need to insert the second access op because THIS is what will
+        // actually be used.
+        auto secondAccessOp =
+            rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+                rewriter.getUnknownLoc(),
+                reuseL1LogicalObjectFifoOp.getOutput(),
+                AMDAIE::MemoryAccess::Read);
+        rewriter.replaceOp(accessOp, secondAccessOp);
+      }
+    });
+  }
+
+  return success();
+}
+
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
index 919004949..82b342c48 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h
@@ -11,13 +11,20 @@
 
 namespace mlir::iree_compiler::AMDAIE {
 
-/// Utility to split logicalobjectfifos given a struct
-/// `SplittingLogicalObjectFifoData` which contains all the required data to
-/// perform the splitting.
+/// Utility to help fetch those input DmaCpyNd ops which need to be split or
+/// combined.
+SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplitOrCombine(
+    ModuleOp moduleOp);
+
+/// Utility to split logicalobjectfifos given a vector of L2->L1 dma ops.
 LogicalResult splitLogicalObjectFifos(
     IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
     MLIRContext *context);
 
+/// Utility to combine logicalobjectfifos given a vector of L2->L1 dma ops.
+LogicalResult combineLogicalObjectFifos(
+    IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
+    MLIRContext *context);
+
 }  // namespace mlir::iree_compiler::AMDAIE
 
 #endif
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
index e6736a7c9..4839246a4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
@@ -16,24 +16,6 @@ namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
-/// Utility to help fetch those input DmaCpyNd Ops which needs to be split.
-static SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplit(
-    ModuleOp moduleOp) {
-  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
-  // We are currently walking through CoreOps gathering 3rd Input DmaOp (if
-  // applicable) from them.
-  // TODO(avarma): We will generalize this later.
-  moduleOp.walk([&](AMDAIE::CoreOp coreOp) {
-    SmallVector<Value> inputDmas = coreOp.getInputDmas();
-    if (inputDmas.size() != 3) return WalkResult::skip();
-    auto dmaCpyNdOp = inputDmas[2].getDefiningOp<AMDAIE::DmaCpyNdOp>();
-    assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op");
-    l2ToL1DmaOps.push_back(dmaCpyNdOp);
-    return WalkResult::advance();
-  });
-  return l2ToL1DmaOps;
-}
-
 class AMDAIESplitLogicalObjFifosForConnectionReusePass
     : public impl::AMDAIESplitLogicalObjFifosForConnectionReuseBase<
           AMDAIESplitLogicalObjFifosForConnectionReusePass> {
@@ -53,7 +35,7 @@ void AMDAIESplitLogicalObjFifosForConnectionReusePass::runOnOperation() {
   IRRewriter rewriter(context);
 
   SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
-      fetchDmaCpyNdOpsToSplit(moduleOp);
+      fetchDmaCpyNdOpsToSplitOrCombine(moduleOp);
 
   if (failed(splitLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) {
     LLVM_DEBUG(llvm::dbgs()
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 2979c71ef..ca7fc9bd5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -53,6 +53,7 @@ iree_cc_library(
     "AMDAIECanonicalizeDma.cpp"
     "AMDAIECanonicalizeNpuDmaCpyNd.cpp"
     "AMDAIECanonicalizeDoublyStridedOp.cpp"
+    "AMDAIECombineLogicalObjFifosForConnectionReuse.cpp"
     "AMDAIECombineStridedOps.cpp"
     "AMDAIEControlCodeLoopUnroll.cpp"
     "AMDAIEConvertCoreForallToFor.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 8912db52d..ec54736db 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -70,6 +70,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEPEELFORLOOP
 #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT
 #define GEN_PASS_DEF_AMDAIESINKINTOCORE
+#define GEN_PASS_DEF_AMDAIECOMBINELOGICALOBJFIFOSFORCONNECTIONREUSE
 #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE
 #define GEN_PASS_DEF_AMDAIETILE
 #define GEN_PASS_DEF_AMDAIETILEANDFUSE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index c1aa45c0b..fa2f73482 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -595,6 +595,8 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
   passManager.addPass(createCSEPass());
   passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass());
+  passManager.addPass(createCSEPass());
+  passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createAMDAIEDmaToCircularDmaPass());
   passManager.addNestedPass<func::FuncOp>(createAMDAIECreateAIEWorkgroupPass());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index fe5670067..b0689bffb 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -230,6 +230,9 @@ std::unique_ptr<Pass> createAMDAIEPeelForLoopPass(
 
 /// Create a pass to sink all dependencies into `amdaie.core` operations.
 std::unique_ptr<Pass> createAMDAIESinkIntoCorePass();
 
+/// Create a pass to combine logicalobjectfifos for connection reuse.
+std::unique_ptr<Pass> createAMDAIECombineLogicalObjFifosForConnectionReusePass();
+
 /// Create a pass to split logicalobjectfifos for connection reuse.
 std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosForConnectionReusePass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 73ceee040..8ea61e340 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -487,6 +487,11 @@ def AMDAIESplitLogicalObjFifosForConnectionReuse :
   let summary = "Pass to split L2 buffers to share inputs of Matmul and Elementwise operations.";
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosForConnectionReusePass()";
 }
+def AMDAIECombineLogicalObjFifosForConnectionReuse :
+    Pass<"iree-amdaie-combine-logical-objectfifos-for-connection-reuse", "ModuleOp"> {
+  let summary = "Pass to combine L2 buffers to share inputs of Matmul and Elementwise operations.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECombineLogicalObjFifosForConnectionReusePass()";
+}
 
 def AMDAIETile :
     InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index ba4380860..261a8068c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -19,6 +19,7 @@ iree_lit_test_suite(
     "canonicalize_dma.mlir"
     "canonicalize_doubly_strided_op.mlir"
     "canonicalize_npu_dma_cpy_nd.mlir"
+    "combine_logicalobjfifos_for_connection_reuse.mlir"
     "combine_strided_ops.mlir"
    "controlcode_loop_unrolling.mlir"
    "convert_core_forall_to_for.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir
new file mode 100644
index 000000000..aee2023e3
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_logicalobjfifos_for_connection_reuse.mlir
@@ -0,0 +1,219 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-combine-logical-objectfifos-for-connection-reuse,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)>
+// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK: @combine_logical_objFifos
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
+// CHECK: memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32>
+// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+// CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]])
+// CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK-DAG: %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
+// CHECK-DAG: %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]])
+// CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} :
+// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+// CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} :
+// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} :
+// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map(%[[IV1]])
+// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map(%[[IV0]])
+// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]])
+// CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
+// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
+// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
+// CHECK: linalg.generic
+// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK: linalg.generic
+// CHECK-SAME: %[[FIRST_READ]]
+// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK: amdaie.end
+// CHECK: }
+// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out :
+// CHECK: linalg.generic
+// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK: %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+// CHECK: linalg.generic
+// CHECK-SAME: %[[SECOND_READ]]
+// CHECK: amdaie.end
+// CHECK: }
+// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
+// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
+// CHECK: linalg.generic
+// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK: linalg.generic
+// CHECK-SAME: %[[FIRST_READ]]
+// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK: amdaie.end
+// CHECK: }
+// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
+// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
+// CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out :
+// CHECK: linalg.generic
+// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK: %[[SECOND_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+// CHECK: linalg.generic
+// CHECK-SAME: %[[SECOND_READ]]
+// CHECK: amdaie.end
+// CHECK: }
+#map = affine_map<(d0) -> (d0 * 64)>
+#map1 = affine_map<(d0) -> (d0 * 64 + 32)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
+#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
+#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
+module {
+  func.func @combine_logical_objFifos(%arg0: !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, %arg2: !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, %arg3: !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>) {
+    %c3 = arith.constant 3 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_2 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_3 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_4 = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+    %alloc_5 = memref.alloc() : memref<128x128xi32>
+    %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %tile = amdaie.tile(%c1, %c3)
+    %tile_7 = amdaie.tile(%c0, %c2)
+    %tile_8 = amdaie.tile(%c1, %c2)
+    %tile_9 = amdaie.tile(%c0, %c3)
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %2 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+    %4 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
+    scf.forall (%arg4, %arg5) in (2, 2) {
+      %5 = affine.apply #map(%arg5)
+      %6 = affine.apply #map1(%arg5)
+      %7 = affine.apply #map(%arg4)
+      %8 = affine.apply #map1(%arg4)
+      %9 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %10 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %7, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %11 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %5] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %12 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, %8, %6] [1, 1, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
+      %13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+      %14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
+      %15 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %13[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %16 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %14[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %17 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_7} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %20 = amdaie.core(%tile_7, in : [%15, %16, %19], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %21 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %1[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %23 = amdaie.core(%tile, in : [%15, %16, %22], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%21, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %24 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_8} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %25 = amdaie.dma_cpy_nd(%24[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %26 = amdaie.core(%tile_8, in : [%15, %16, %25], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%24, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps =
+[#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+      %27 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_9} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %28 = amdaie.dma_cpy_nd(%27[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %3[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+      %29 = amdaie.core(%tile_9, in : [%15, %16, %28], out : [%17]) {
+        %30 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>> -> memref<1x1x4x8x4x8xi32, 2 : i32>
+        %31 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>> -> memref<1x1x8x4x8x4xi32, 2 : i32>
+        %32 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%30, %31 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%32 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.muli %in, %in_10 : i32
+          %36 = arith.addi %out, %35 : i32
+          linalg.yield %36 : i32
+        }
+        %33 = amdaie.logicalobjectfifo.access(%27, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        %34 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>> -> memref<1x1x8x8x4x4xi32, 2 : i32>
+        linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32, %33 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%34 : memref<1x1x8x8x4x4xi32, 2 : i32>) {
+        ^bb0(%in: i32, %in_10: i32, %out: i32):
+          %35 = arith.addi %in, %in_10 : i32
+          linalg.yield %35 : i32
+        }
+        amdaie.end
+      }
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_5 : memref<128x128xi32>
+    memref.dealloc %alloc_1 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<1x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_4 : memref<1x1x32x32xi32, 1 : i32>
+    return
+  }
+}
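Note: quick sanity arithmetic for the expected output above: the four original 1x1x32x32 L2 tiles are merged pairwise into two 1x2x32x32 buffers, and each L2->L1 copy then selects one half via the second source offset (0 or 1). As a standalone C++ sketch of the size bookkeeping:

    constexpr int64_t combinedElems = 1 * 2 * 32 * 32;      // 2048 i32 each
    static_assert(combinedElems == 2 * (1 * 1 * 32 * 32),
                  "one combined L2 buffer holds exactly two original tiles");

This halves the number of L3->L2 connections (two combined dma_cpy_nd ops instead of four) while the reused L1 objectfifo is shared by the two cores of each pair, which is the connection reuse the pass name refers to.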