Skip to content

Commit

Permalink
Lowering of linalg.copy to dmas (via iree.linalg_ext.pack) (nod-ai#780)
Browse files Browse the repository at this point in the history
Add support for lowering linalg.copy, unify with logic for lowering linalg.pack and linalg.unpack
  • Loading branch information
newling authored Sep 19, 2024
1 parent e142632 commit 671fadc
Show file tree
Hide file tree
Showing 10 changed files with 163 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
#include "llvm/Support/FormatVariadic.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Pass/Pass.h"

#define DEBUG_TYPE "iree-amdaie-pack-to-dma"
#define DEBUG_TYPE "iree-amdaie-convert-to-dma"



namespace mlir::iree_compiler::AMDAIE {

Expand Down Expand Up @@ -302,25 +303,76 @@ LogicalResult rewriteAsDma(PackOrUnpackOp op, IRRewriter &rewriter) {
return rewriteAsDma(rewriter, op, input, output, innerTiles);
}

/// Convert a linalg.copy operation on 2 memrefs to an equivalent 'identity'
/// iree_linalg_ext pack/unpack operation, i.e. one with empty inner dimension
/// positions and empty inner tiles.
///
/// The direction is decided by comparing the integer memory spaces of the two
/// operands: if the copy is to a memory closer to the core (source memory
/// space <= destination memory space) it is converted to a pack operation,
/// otherwise to an unpack operation.
///
/// Note: we could convert all copies to packs, but it would be potentially
/// confusing to have pack ops moving data away from cores.
///
/// Emits an op error on `copyOp` and returns failure if the operation does not
/// have exactly 2 operands and 0 results, or if either operand is not a
/// memref. On success `copyOp` has been replaced by the new pack/unpack op.
LogicalResult copyToPack(IRRewriter &rewriter, linalg::CopyOp copyOp) {
  if (copyOp.getNumOperands() != 2 || copyOp.getNumResults() != 0) {
    copyOp.emitOpError()
        << "has " << copyOp.getNumOperands() << " operands and "
        << copyOp.getNumResults()
        << " results. It must have 2 operands and 0 results to convert "
           "to an iree.linalg_ext dialect pack/unpack operation";
    return failure();
  }

  Value src = copyOp.getOperand(0);
  Value dst = copyOp.getOperand(1);

  // Use dyn_cast rather than cast: the operand/result count check above does
  // not guarantee that both operands are memrefs, and an unchecked cast would
  // assert instead of reporting a diagnostic.
  auto srcType = dyn_cast<MemRefType>(src.getType());
  auto dstType = dyn_cast<MemRefType>(dst.getType());
  if (!srcType || !dstType) {
    copyOp.emitOpError()
        << "must have memref operands to convert to an iree.linalg_ext "
           "dialect pack/unpack operation";
    return failure();
  }

  // Setting up the 'identity' pack/unpack:
  ArrayRef<int64_t> innerDimsPos{};
  ArrayRef<OpFoldResult> innerTiles{};

  // MemRefTypes with no memory space attribute return 0 here, so this is safe.
  uint32_t srcMemspace = srcType.getMemorySpaceAsInt();
  uint32_t dstMemspace = dstType.getMemorySpaceAsInt();
  const bool towardsCore = srcMemspace <= dstMemspace;

  // The replacement op is created at the position of the copy it replaces.
  rewriter.setInsertionPoint(copyOp);
  if (towardsCore) {
    rewriter.replaceOpWithNewOp<IREE::LinalgExt::PackOp>(
        copyOp, src, dst, innerDimsPos, innerTiles);
  } else {
    rewriter.replaceOpWithNewOp<IREE::LinalgExt::UnPackOp>(
        copyOp, src, dst, innerDimsPos, innerTiles);
  }

  return success();
}

}; // namespace

class AMDAIEPackToDmaPass
: public impl::AMDAIEPackToDmaBase<AMDAIEPackToDmaPass> {
class AMDAIEConvertToDmaPass
: public impl::AMDAIEConvertToDmaBase<AMDAIEConvertToDmaPass> {
public:
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<tensor::TensorDialect, linalg::LinalgDialect,
IREE::LinalgExt::IREELinalgExtDialect, AMDAIEDialect>();
}

AMDAIEPackToDmaPass() = default;
AMDAIEPackToDmaPass(const AMDAIEPackToDmaPass &pass){};
AMDAIEConvertToDmaPass() = default;
AMDAIEConvertToDmaPass(const AMDAIEConvertToDmaPass &pass){};
void runOnOperation() override;
};

void AMDAIEPackToDmaPass::runOnOperation() {
void AMDAIEConvertToDmaPass::runOnOperation() {
MLIRContext *context = &getContext();
IRRewriter rewriter(context);

// Convert all linalg.copy to iree_linalg_ext.pack/unpack ops. We then
// bootstrap the work done for lowering the pack/unpack op to dmas as the next
// step. This is easy to implement, but not the most direct lowering, so
// we might want to revisit this.
WalkResult convertCopiesWalkResult =
getOperation()->walk([&rewriter](linalg::CopyOp copyOp) {
if (failed(copyToPack(rewriter, copyOp)))
return WalkResult::interrupt();
return WalkResult::advance();
});
if (convertCopiesWalkResult.wasInterrupted()) return signalPassFailure();

auto walkResult = getOperation()->walk(
[&rewriter](IREE::LinalgExt::PackOp op) -> WalkResult {
if (failed(rewriteAsDma(op, rewriter))) {
Expand All @@ -339,7 +391,7 @@ void AMDAIEPackToDmaPass::runOnOperation() {
if (walkResult.wasInterrupted()) signalPassFailure();
}

std::unique_ptr<Pass> createAMDAIEPackToDmaPass() {
return std::make_unique<AMDAIEPackToDmaPass>();
std::unique_ptr<Pass> createAMDAIEConvertToDmaPass() {
return std::make_unique<AMDAIEConvertToDmaPass>();
}
} // namespace mlir::iree_compiler::AMDAIE
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ iree_cc_library(
"AMDAIECanonicalizeNpuDmaCpyNd.cpp"
"AMDAIECanonicalizeDoublyStridedOp.cpp"
"AMDAIECombineStridedOps.cpp"
"AMDAIEConvertToDma.cpp"
"AMDAIEControlCodeLoopUnroll.cpp"
"AMDAIEConvertCoreForallToFor.cpp"
"AMDAIECreateAIEWorkgroup.cpp"
Expand Down Expand Up @@ -85,7 +86,6 @@ iree_cc_library(
"AMDAIENormalizeLoopBounds.cpp"
"AMDAIEObjFifoBufferization.cpp"
"AMDAIEPackAndTranspose.cpp"
"AMDAIEPackToDma.cpp"
"AMDAIEPad.cpp"
"AMDAIEPeelForLoop.cpp"
"AMDAIEPropagateDataLayout.cpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ namespace mlir::iree_compiler::AMDAIE {
#define GEN_PASS_DEF_AMDAIENORMALIZELOOPBOUNDS
#define GEN_PASS_DEF_AMDAIEOBJFIFOBUFFERIZATION
#define GEN_PASS_DEF_AMDAIEPACKANDTRANSPOSE
#define GEN_PASS_DEF_AMDAIEPACKTODMA
#define GEN_PASS_DEF_AMDAIECONVERTTODMA
#define GEN_PASS_DEF_AMDAIEPAD
#define GEN_PASS_DEF_AMDAIEPEELFORLOOP
#define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ void buildAMDAIETransformPassPipeline(OpPassManager &variantPassManager,
void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
passManager.addPass(createAMDAIEPackToDmaPass());
passManager.addPass(createAMDAIEConvertToDmaPass());

passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
passManager.addPass(createAMDAIEInsertCoresPass());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,9 @@ std::unique_ptr<Pass> createAMDAIEObjFifoBufferizationPass();
std::unique_ptr<Pass> createAMDAIEPackAndTransposePass(
AMDAIEPackAndTransposeOptions options = {});

/// Create pass to lower pack/unpack ops to AMDAIE DMA ops operating on
/// Create pass to lower copy/pack/unpack ops to AMDAIE DMA ops operating on
/// logical objectFifos.
std::unique_ptr<Pass> createAMDAIEPackToDmaPass();
std::unique_ptr<Pass> createAMDAIEConvertToDmaPass();

/// Create a pass to pad MatmulOp.
std::unique_ptr<Pass> createAMDAIEPadPass(AMDAIEPadOptions options = {});
Expand Down
41 changes: 37 additions & 4 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -411,10 +411,43 @@ def AMDAIEPackAndTranspose :
];
}

def AMDAIEPackToDma :
Pass<"iree-amdaie-pack-to-dma", ""> {
let summary = "Convert pack/unpack ops to AMDAIE DMA ops operating on logical objectFifos.";
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEPackToDmaPass()";
def AMDAIEConvertToDma :
Pass<"iree-amdaie-convert-to-dma", ""> {
let summary = "Convert linalg ops to AMDAIE DMA ops on logical objectFifos.";
let description = [{
Converts/lowers linalg.copy, iree_linalg_ext.pack, and iree_linalg_ext.unpack
ops to dma operations. For example, given allocations,

```mlir
%src = memref.alloc() : memref<8x16xi32, 1>
%dst = memref.alloc() : memref<1x1x8x16xi32, 1>
```

and a packing copy operation between them,
```mlir
iree_linalg_ext.pack %src inner_dims_pos = [0, 1] inner_tiles = [8, 16]
into %dst : (memref<8x16xi32, 1> memref<1x1x8x16xi32, 1>)
```

the pack gets lowered to,
```mlir
%0 = amdaie.logicalobjectfifo.from_memref %dst, {} : memref<1x1x8x16xi32, 1>
-> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
%1 = amdaie.logicalobjectfifo.from_memref %src, {} : memref<8x16xi32, 1>
-> !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>
%2 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1],
%1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) :
(!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>,
!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
```

The approach for converting linalg.copy operations is to first convert them
to identity iree_linalg_ext dialect pack/unpack operations, and then rely on
the lowering of pack/unpack to dma operations to do the heavy lifting.

}];
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConvertToDmaPass()";
}

def AMDAIEPad :
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ iree_lit_test_suite(
"pack_and_transpose_level1.mlir"
"pack_and_transpose_level2.mlir"
"pack_to_air.mlir"
"pack_to_dma.mlir"
"pack_to_dma_failures.mlir"
"convert_to_dma.mlir"
"convert_to_dma_failures.mlir"
"pad.mlir"
"peel_for_loop.mlir"
"propagate_data_layout.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: iree-opt --iree-amdaie-pack-to-dma --cse --split-input-file %s | FileCheck %s
// RUN: iree-opt --iree-amdaie-convert-to-dma --cse --split-input-file %s | FileCheck %s

// CHECK-LABEL: @basic_unitdim_pack
// CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<1x1x8x16xi32, 1>
Expand Down Expand Up @@ -132,6 +132,7 @@ func.func @permute_unpack() {
}

// -----

// CHECK-LABEL: @subview_unpack
// CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<32x8x64xf32>
// CHECK: %[[FROMMEMREF0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC0]], {} : memref<32x8x64xf32> -> !amdaie.logicalobjectfifo<memref<32x8x64xf32>>
Expand All @@ -157,3 +158,57 @@ func.func @subview_unpack() {
}
return
}

// -----

// CHECK-LABEL: @basic_copy
// CHECK: %[[SRC:.*]] = memref.alloc() : memref<8x16xi32, 1>
// CHECK: %[[FROMSRC:.*]] = amdaie.logicalobjectfifo.from_memref %[[SRC]], {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>
// CHECK: %[[DST:.*]] = memref.alloc() : memref<8x16xi32, 1>
// CHECK: %[[FROMDST:.*]] = amdaie.logicalobjectfifo.from_memref %[[DST]], {} : memref<8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>
// CHECK: %[[DMA0:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[FROMDST]][0, 0] [8, 16] [16, 1]
// CHECK-SAME: %[[FROMSRC]][0, 0] [8, 16] [16, 1]
// CHECK-SAME: (!amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
func.func @basic_copy() {
%src = memref.alloc() : memref<8x16xi32, 1>
%dst = memref.alloc() : memref<8x16xi32, 1>
linalg.copy ins(%src : memref<8x16xi32, 1>) outs(%dst : memref<8x16xi32, 1>)
return
}

// -----

// CHECK-LABEL: @copy_towards_core
// CHECK: %[[SRC:.*]] = memref.alloc() : memref<8xi32>
// CHECK: %[[FROMSRC:.*]] = amdaie.logicalobjectfifo.from_memref %[[SRC]], {} : memref<8xi32> -> !amdaie.logicalobjectfifo<memref<8xi32>>
// CHECK: %[[DST:.*]] = memref.alloc() : memref<8xi32, 1>
// CHECK: %[[FROMDST:.*]] = amdaie.logicalobjectfifo.from_memref %[[DST]], {} : memref<8xi32, 1> -> !amdaie.logicalobjectfifo<memref<8xi32, 1>>
// CHECK: %[[DMA0:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[FROMDST]][0] [8] [1]
// CHECK-SAME: %[[FROMSRC]][0] [8] [1]
// CHECK-SAME: (!amdaie.logicalobjectfifo<memref<8xi32, 1>>, !amdaie.logicalobjectfifo<memref<8xi32>>)
func.func @copy_towards_core() {
%src = memref.alloc() : memref<8xi32>
%dst = memref.alloc() : memref<8xi32, 1>
linalg.copy ins(%src : memref<8xi32>) outs(%dst : memref<8xi32, 1>)
return
}

// -----

// CHECK-LABEL: @copy_away_from_core
// CHECK: %[[SRC:.*]] = memref.alloc() : memref<8xi32, 2>
// CHECK: %[[FROMSRC:.*]] = amdaie.logicalobjectfifo.from_memref %[[SRC]], {} : memref<8xi32, 2> -> !amdaie.logicalobjectfifo<memref<8xi32, 2>>
// CHECK: %[[DST:.*]] = memref.alloc() : memref<8xi32, 1>
// CHECK: %[[FROMDST:.*]] = amdaie.logicalobjectfifo.from_memref %[[DST]], {} : memref<8xi32, 1> -> !amdaie.logicalobjectfifo<memref<8xi32, 1>>
// CHECK: %[[DMA0:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[FROMDST]][0] [8] [1]
// CHECK-SAME: %[[FROMSRC]][0] [8] [1]
// CHECK-SAME: (!amdaie.logicalobjectfifo<memref<8xi32, 1>>, !amdaie.logicalobjectfifo<memref<8xi32, 2>>)
func.func @copy_away_from_core() {
%src = memref.alloc() : memref<8xi32, 2>
%dst = memref.alloc() : memref<8xi32, 1>
linalg.copy ins(%src : memref<8xi32, 2>) outs(%dst : memref<8xi32, 1>)
return
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: iree-opt %s --iree-amdaie-pack-to-dma -verify-diagnostics
// RUN: iree-opt %s --iree-amdaie-convert-to-dma -verify-diagnostics

#map = affine_map<()[s0] -> (s0 * 8)>

Expand Down
2 changes: 1 addition & 1 deletion tests/samples/matmul_pack_peel_objectfifo.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-dma-cse,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,canonicalize,iree-amdaie-canonicalize-npu-dma-cpy-nd,canonicalize,iree-amdaie-sink-into-core,canonicalize,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s
// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-convert-to-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-dma-loop-subsumption,cse,canonicalize,iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,iree-amdaie-dma-cse,iree-amdaie-create-logical-objectfifo-link,iree-amdaie-canonicalize-doubly-strided-op,canonicalize,iree-amdaie-canonicalize-npu-dma-cpy-nd,canonicalize,iree-amdaie-sink-into-core,canonicalize,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s

// CHECK: aie.device(npu1_4col)
// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)
Expand Down

0 comments on commit 671fadc

Please sign in to comment.