Skip to content

Commit

Permalink
[NpuDmaCpyNdOp/NpuDmaWaitOp] Return optional async token and wait for…
Browse files Browse the repository at this point in the history
… multiple (nod-ai#827)

This PR makes the return type of `AMDAIE::NpuDmaCpyNdOp` optional and
more explicit by introducing an `amdaie.async_token` type. The
`AMDAIE::NpuDmaWaitOp` now operates on operands of the async token
type. This is consistent with how `gpu.async_token` is used in the GPU
dialect.
  • Loading branch information
jtuyls authored Oct 7, 2024
1 parent c2fd592 commit 0757023
Show file tree
Hide file tree
Showing 21 changed files with 659 additions and 559 deletions.
95 changes: 65 additions & 30 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -660,11 +660,12 @@ void LogicalObjectFifoRelease::build(OpBuilder &b, mlir::OperationState &result,
// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target and
// source BD IDs.
void NpuDmaCpyNdOp::build(
OpBuilder &b, OperationState &result, Value dma, Value target,
ArrayRef<OpFoldResult> targetOffsets, ArrayRef<OpFoldResult> targetSizes,
ArrayRef<OpFoldResult> targetStrides, Value targetBdId, Value source,
ArrayRef<OpFoldResult> sourceOffsets, ArrayRef<OpFoldResult> sourceSizes,
ArrayRef<OpFoldResult> sourceStrides, Value sourceBdId) {
OpBuilder &b, OperationState &result, TypeRange resultTypes,
Value connection, Value target, ArrayRef<OpFoldResult> targetOffsets,
ArrayRef<OpFoldResult> targetSizes, ArrayRef<OpFoldResult> targetStrides,
Value targetBdId, Value source, ArrayRef<OpFoldResult> sourceOffsets,
ArrayRef<OpFoldResult> sourceSizes, ArrayRef<OpFoldResult> sourceStrides,
Value sourceBdId) {
SmallVector<int64_t> staticTargetOffsets, staticTargetSizes,
staticTargetStrides;
SmallVector<int64_t> staticSourceOffsets, staticSourceSizes,
Expand All @@ -685,7 +686,7 @@ void NpuDmaCpyNdOp::build(
staticSourceSizes);
dispatchIndexOpFoldResults(sourceStrides, dynamicSourceStrides,
staticSourceStrides);
build(b, result, b.getIndexType(), dma, target, dynamicTargetOffsets,
build(b, result, resultTypes, connection, target, dynamicTargetOffsets,
dynamicTargetSizes, dynamicTargetStrides, staticTargetOffsets,
staticTargetSizes, staticTargetStrides, targetBdId, source,
dynamicSourceOffsets, dynamicSourceSizes, dynamicSourceStrides,
Expand All @@ -695,11 +696,12 @@ void NpuDmaCpyNdOp::build(

// Build a NpuDmaCpyNdOp with static entries.
void NpuDmaCpyNdOp::build(
OpBuilder &b, OperationState &result, Value dma, Value target,
ArrayRef<int64_t> targetOffsets, ArrayRef<int64_t> targetSizes,
ArrayRef<int64_t> targetStrides, mlir::Value targetBdId, Value source,
ArrayRef<int64_t> sourceOffsets, ArrayRef<int64_t> sourceSizes,
ArrayRef<int64_t> sourceStrides, mlir::Value sourceBdId) {
OpBuilder &b, OperationState &result, TypeRange resultTypes,
Value connection, Value target, ArrayRef<int64_t> targetOffsets,
ArrayRef<int64_t> targetSizes, ArrayRef<int64_t> targetStrides,
mlir::Value targetBdId, Value source, ArrayRef<int64_t> sourceOffsets,
ArrayRef<int64_t> sourceSizes, ArrayRef<int64_t> sourceStrides,
mlir::Value sourceBdId) {
SmallVector<OpFoldResult> targetOffsetValues = llvm::to_vector<4>(
llvm::map_range(targetOffsets, [&](int64_t v) -> OpFoldResult {
return b.getI64IntegerAttr(v);
Expand All @@ -724,18 +726,19 @@ void NpuDmaCpyNdOp::build(
llvm::map_range(sourceStrides, [&](int64_t v) -> OpFoldResult {
return b.getI64IntegerAttr(v);
}));
build(b, result, dma, target, targetOffsetValues, targetSizeValues,
targetStrideValues, targetBdId, source, sourceOffsetValues,
sourceSizeValues, sourceStrideValues, sourceBdId);
build(b, result, resultTypes, connection, target, targetOffsetValues,
targetSizeValues, targetStrideValues, targetBdId, source,
sourceOffsetValues, sourceSizeValues, sourceStrideValues, sourceBdId);
}

// Build a NpuDmaCpyNdOp with dynamic entries.
void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma,
Value target, ValueRange targetOffsets,
ValueRange targetSizes, ValueRange targetStrides,
mlir::Value targetBdId, Value source,
ValueRange sourceOffsets, ValueRange sourceSizes,
ValueRange sourceStrides, mlir::Value sourceBdId) {
void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
TypeRange resultTypes, Value connection, Value target,
ValueRange targetOffsets, ValueRange targetSizes,
ValueRange targetStrides, mlir::Value targetBdId,
Value source, ValueRange sourceOffsets,
ValueRange sourceSizes, ValueRange sourceStrides,
mlir::Value sourceBdId) {
SmallVector<OpFoldResult> targetOffsetValues =
llvm::to_vector<4>(llvm::map_range(
targetOffsets, [](Value v) -> OpFoldResult { return v; }));
Expand All @@ -752,13 +755,20 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma,
SmallVector<OpFoldResult> sourceStrideValues =
llvm::to_vector<4>(llvm::map_range(
sourceStrides, [](Value v) -> OpFoldResult { return v; }));
build(b, result, dma, target, targetOffsetValues, targetSizeValues,
targetStrideValues, targetBdId, source, sourceOffsetValues,
sourceSizeValues, sourceStrideValues, sourceBdId);
build(b, result, resultTypes, connection, target, targetOffsetValues,
targetSizeValues, targetStrideValues, targetBdId, source,
sourceOffsetValues, sourceSizeValues, sourceStrideValues, sourceBdId);
}

void NpuDmaCpyNdOp::print(OpAsmPrinter &p) {
Operation *op = getOperation();
for (OpResult res : getAsyncTokens()) {
if (isa<AMDAIE::AsyncTargetTokenType>(res.getType())) {
p << " async_target";
} else if (isa<AMDAIE::AsyncSourceTokenType>(res.getType())) {
p << " async_source";
}
}
p << " " << getConnection() << "(";
if (getTarget()) p << getTarget();
printDynamicIndexList(p, op, getTargetOffsets(), getTargetStaticOffsets());
Expand Down Expand Up @@ -806,6 +816,16 @@ ParseResult NpuDmaCpyNdOp::parse(OpAsmParser &parser, OperationState &result) {
sourceDynamicSizes, sourceDynamicStrides;
SmallVector<Type, 1> targetTypes;
SmallVector<Type, 1> sourceTypes;
SmallVector<Type, 1> asyncTokenTypes;

if (succeeded(parser.parseOptionalKeyword("async_target"))) {
asyncTokenTypes.push_back(
parser.getBuilder().getType<AMDAIE::AsyncTargetTokenType>());
}
if (succeeded(parser.parseOptionalKeyword("async_source"))) {
asyncTokenTypes.push_back(
parser.getBuilder().getType<AMDAIE::AsyncSourceTokenType>());
}

if (failed(parser.parseOperand(dma)) || failed(parser.parseLParen()))
return failure();
Expand Down Expand Up @@ -899,6 +919,8 @@ ParseResult NpuDmaCpyNdOp::parse(OpAsmParser &parser, OperationState &result) {
}
}

result.addTypes(asyncTokenTypes);

llvm::copy(
ArrayRef<int32_t>({1, static_cast<int32_t>(targetOperands.size()),
static_cast<int32_t>(targetDynamicOffsets.size()),
Expand Down Expand Up @@ -955,8 +977,6 @@ ParseResult NpuDmaCpyNdOp::parse(OpAsmParser &parser, OperationState &result) {
result.operands))) {
return failure();
}

result.addTypes(indexType);
return success();
}

Expand All @@ -970,9 +990,9 @@ DoublyStridedOpInterface NpuDmaCpyNdOp::createDoublyStridedOp(
::llvm::SmallVector<OpFoldResult> &newSourceStrides) {
Location loc = (*this)->getLoc();
auto newOp = rewriter.create<AMDAIE::NpuDmaCpyNdOp>(
loc, getConnection(), getTarget(), newTargetOffsets, newTargetSizes,
newTargetStrides, getTargetBdId(), getSource(), newSourceOffsets,
newSourceSizes, newSourceStrides, getSourceBdId());
loc, getResultTypes(), getConnection(), getTarget(), newTargetOffsets,
newTargetSizes, newTargetStrides, getTargetBdId(), getSource(),
newSourceOffsets, newSourceSizes, newSourceStrides, getSourceBdId());
return cast<DoublyStridedOpInterface>(newOp.getOperation());
}

Expand All @@ -991,8 +1011,8 @@ struct NpuDmaCpyNdOpReplacementBuilder {
ArrayRef<OpFoldResult> srcMixedSizes,
ArrayRef<OpFoldResult> srcMixedStrides) {
rewriter.replaceOpWithNewOp<NpuDmaCpyNdOp>(
dmaOp, dmaOp.getConnection(), dmaOp.getTarget(), tgtMixedOffsets,
tgtMixedSizes, tgtMixedStrides, dmaOp.getTargetBdId(),
dmaOp, dmaOp.getResultTypes(), dmaOp.getConnection(), dmaOp.getTarget(),
tgtMixedOffsets, tgtMixedSizes, tgtMixedStrides, dmaOp.getTargetBdId(),
dmaOp.getSource(), srcMixedOffsets, srcMixedSizes, srcMixedStrides,
dmaOp.getSourceBdId());
}
Expand Down Expand Up @@ -1150,6 +1170,21 @@ void NpuCircularDmaCpyNdOp::getCanonicalizationPatterns(
context);
}

//===----------------------------------------------------------------------===//
// AMDAIE_NpuDmaWaitOp
//===----------------------------------------------------------------------===//

// Collects the `NpuDmaCpyNdOp`s that produce this wait op's async token
// operands, in operand order. A token whose defining op is absent or is not a
// `NpuDmaCpyNdOp` is skipped. One entry is appended per matching token, so a
// DMA op that contributes several tokens appears several times.
SmallVector<AMDAIE::NpuDmaCpyNdOp> NpuDmaWaitOp::getDmaOps() {
  SmallVector<AMDAIE::NpuDmaCpyNdOp> producers;
  for (Value asyncToken : getAsyncTokens()) {
    // `dyn_cast_if_present` tolerates a null defining op.
    auto producer = dyn_cast_if_present<AMDAIE::NpuDmaCpyNdOp>(
        asyncToken.getDefiningOp());
    if (!producer) continue;
    producers.push_back(producer);
  }
  return producers;
}

//===----------------------------------------------------------------------===//
// AMDAIE_TileOp
//===----------------------------------------------------------------------===//
Expand Down
93 changes: 55 additions & 38 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -379,8 +379,7 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [
//===----------------------------------------------------------------------===//

def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
AttrSizedOperandSegments, DoublyStridedOpInterface]>,
Results<(outs Index)> {
AttrSizedOperandSegments, DoublyStridedOpInterface]> {
let summary = "The Npu uController's dma operator";
let description = [{
The Npu DMA operation represents a strided copy operation with an unlimited
Expand All @@ -393,6 +392,16 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
source and target `offsets`, `sizes` and `strides`. A special sentinel value
ShapedType::kDynamic encodes that the corresponding entry has a dynamic value.

The op can be `async` by specifying one or more `AnyAsyncTokenType` results.
Other wait-like operations can use these async tokens to describe a blocking
operation on the source and/or target side of this DMA operation.
In case of `async_source`, this operation returns an async token that can be
waited on until the source DMA port is done executing.
In case of `async_target`, this operation returns an async token that can be
waited on until the target DMA port is done executing.
In case of both `async_source` and `async_target`, two tokens are returned,
each of which can be blocked on.

Example:

```mlir
Expand All @@ -401,7 +410,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
!amdaie.logicalobjectfifo<memref<32x1024xi32>>)
...
amdaie.controlcode {
%3 = amdaie.npu.dma_cpy_nd %2([] [] [], [0, 0] [32, 64] [1024, 1])
%3 = amdaie.npu.dma_cpy_nd async_source %2([] [] [], [0, 0] [32, 64] [1024, 1])
...
}
```
Expand All @@ -427,33 +436,35 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
Optional<Index>:$source_bd_id
);

let results = (outs Variadic<AMDAIE_AnyAsyncTokenType>:$async_tokens);

// Use a custom assembly format because of weird spaces being inserted around
// the optional `target` by the default assembly format generator.
let hasCustomAssemblyFormat = 1;

let builders = [
// Build a NpuDmaCpyNdOp with mixed static and dynamic entries.
OpBuilder<(ins "Value":$dma, "::mlir::Value":$target,
"ArrayRef<OpFoldResult>":$target_offsets,
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
"::mlir::Value":$target, "ArrayRef<OpFoldResult>":$target_offsets,
"ArrayRef<OpFoldResult>":$target_sizes,
"ArrayRef<OpFoldResult>":$target_strides, "::mlir::Value":$target_bd_id,
"::mlir::Value":$source, "ArrayRef<OpFoldResult>":$source_offsets,
"ArrayRef<OpFoldResult>":$source_sizes,
"ArrayRef<OpFoldResult>":$source_strides, "::mlir::Value":$source_bd_id)>,
// Build a NpuDmaCpyNdOp with static entries.
OpBuilder<(ins "Value":$dma, "::mlir::Value":$target,
"ArrayRef<int64_t>":$target_offsets, "ArrayRef<int64_t>":$target_sizes,
"ArrayRef<int64_t>":$target_strides, "::mlir::Value":$target_bd_id,
"::mlir::Value":$source, "ArrayRef<int64_t>":$source_offsets,
"ArrayRef<int64_t>":$source_sizes, "ArrayRef<int64_t>":$source_strides,
"::mlir::Value":$source_bd_id)>,
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
"::mlir::Value":$target, "ArrayRef<int64_t>":$target_offsets,
"ArrayRef<int64_t>":$target_sizes, "ArrayRef<int64_t>":$target_strides,
"::mlir::Value":$target_bd_id, "::mlir::Value":$source,
"ArrayRef<int64_t>":$source_offsets, "ArrayRef<int64_t>":$source_sizes,
"ArrayRef<int64_t>":$source_strides, "::mlir::Value":$source_bd_id)>,
// Build a NpuDmaCpyNdOp with dynamic entries.
OpBuilder<(ins "Value":$dma, "::mlir::Value":$target,
"ValueRange":$target_offsets, "ValueRange":$target_sizes,
"ValueRange":$target_strides, "::mlir::Value":$target_bd_id,
"::mlir::Value":$source, "ValueRange":$source_offsets,
"ValueRange":$source_sizes, "ValueRange":$source_strides,
"::mlir::Value":$source_bd_id)>
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$dma,
"::mlir::Value":$target, "ValueRange":$target_offsets,
"ValueRange":$target_sizes, "ValueRange":$target_strides,
"::mlir::Value":$target_bd_id, "::mlir::Value":$source,
"ValueRange":$source_offsets, "ValueRange":$source_sizes,
"ValueRange":$source_strides, "::mlir::Value":$source_bd_id)>
];

let extraClassDeclaration = [{
Expand Down Expand Up @@ -702,45 +713,51 @@ def AMDAIE_NpuCircularDmaCpyNdOp: AMDAIE_Op<"npu.circular_dma_cpy_nd", [
def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> {
let summary = "Wait for the Npu DMA operation to complete.";
let description = [{
The wait operation will block on the referenced Npu DMA operation to complete
execution on the provided `direction`. The `S2MM` direction will block on the
destination side of the dma operation, ensuring complete execution. The
`MM2S` direction will block on the source side of the dma operation,
ensuring that the DMA has successfully started execution, but not
guaranteeing that all data has been received on the destination side.
The wait operation will block on the referenced dependent ops.

If a dependent op returns a `!amdaie.async_token`, this wait op will block
on the dependent op having completed execution.
If a dependent op returns a `!amdaie.async_source_token`, this wait op will
block on the source side of the referenced dependent op having completed
execution.
If a dependent op returns a `!amdaie.async_target_token`, this wait op will
block on the target side of the referenced dependent op having completed
execution.

Being able to block on the source and/or target side separately is useful
for copy/dma-like operations that involve multiple physical ports/channels
in hardware. In this case, blocking on the source side and/or target side,
might be different from blocking on the entire operation.

Example:

```mlir
%2 = amdaie.npu.dma_cpy_nd %0([] [] [], [%c0, %c0] [%c32, %c64] [%c1024, %c1])
amdaie.npu.dma_wait(%2, MM2S)
%2 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], [%c0, %c0] [%c32, %c64] [%c1024, %c1])
amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
```

Here, the `dma_wait` operation will wait until the referenced Npu DMA
operation has started execution. On the other hand, the `S2MM` direction can
be used to wait on the destination side of the DMA, i.e. until the DMA has
finished its write into the target memory:
operation has started execution. On the other hand, the
`!amdaie.async_target_token` can be used to wait on the target side of the
DMA, i.e. until the DMA has finished its write into the target memory:

```mlir
%2 = amdaie.npu.dma_cpy_nd %0([%c0, %c0] [%c32, %c64] [%c1024, %c1], [] [] [])
amdaie.npu.dma_wait(%2, S2MM)
%2 = amdaie.npu.dma_cpy_nd async_target %0([%c0, %c0] [%c32, %c64] [%c1024, %c1], [] [] [])
amdaie.npu.dma_wait(%2 : !amdaie.async_target_token)
```
}];

let arguments = (
ins Index:$dma,
DMAChannelDir:$direction
ins Variadic<AMDAIE_AnyAsyncTokenType>:$async_tokens
);

let assemblyFormat = [{
`(` $dma `,` $direction `)` attr-dict
(`(` $async_tokens^ `:` type($async_tokens) `)`)? attr-dict
}];

let extraClassDeclaration = [{
// Return the Npu DMA operation argument.
NpuDmaCpyNdOp getDmaOp() {
return dyn_cast_if_present<NpuDmaCpyNdOp>(getDma().getDefiningOp());
}
// Return the Npu DMA operation arguments.
SmallVector<NpuDmaCpyNdOp> getDmaOps();
}];
}

Expand Down Expand Up @@ -1073,7 +1090,7 @@ def AMDAIE_LogicalObjectFifoPlaceholderOp:
amdaie.controlcode {
%obj1 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0}
: memref<1024xi32> -> !amdaie.logicalobjectfifo<memref<1024xi32>>
%npu_dma = amdaie.npu.dma_cpy_nd %connection([] [] [],
%npu_dma = amdaie.npu.dma_cpy_nd async_source %connection([] [] [],
%obj0[%c0, %c32] [%c32, %c32] [%c32, %c1])
: source_type = !amdaie.logicalobjectfifo<memref<1024xi32>>
amdaie.end
Expand Down
10 changes: 10 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.td
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@ class AMDAIEDialect_Type<string name, string typeMnemonic, list<Trait> traits =
let mnemonic = typeMnemonic;
}

// The types for async tokens which can be returned from async operations. The
// dedicated `source` and `target` token types can be used to specify which
// side of an operation (for example a copy/DMA) should be synchronized on if
// there are multiple.
def AMDAIE_AsyncTokenType : AMDAIEDialect_Type<"AsyncToken", "async_token">;
def AMDAIE_AsyncSourceTokenType : AMDAIEDialect_Type<"AsyncSourceToken", "async_source_token">;
def AMDAIE_AsyncTargetTokenType : AMDAIEDialect_Type<"AsyncTargetToken", "async_target_token">;
def AMDAIE_AnyAsyncTokenType
: AnyTypeOf<[AMDAIE_AsyncTokenType, AMDAIE_AsyncSourceTokenType, AMDAIE_AsyncTargetTokenType]>;

def AMDAIE_LogicalObjectFifoType :
AMDAIEDialect_Type<"LogicalObjectFifo", "logicalobjectfifo"> {
let summary = "The logical objectfifo type encapsulating a memref";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class DmaOpInterfaceTest : public ::testing::Test {
auto input =
rewriter.create<arith::ConstantIndexOp>(rewriter.getUnknownLoc(), 2);
auto dmaOp = rewriter.create<mlir::iree_compiler::AMDAIE::NpuDmaCpyNdOp>(
rewriter.getUnknownLoc(), input, target, targetOffsetsOfr,
rewriter.getUnknownLoc(), TypeRange{}, input, target, targetOffsetsOfr,
targetSizesOfr, targetStridesOfr, nullptr, source, sourceOffsetsOfr,
sourceSizesOfr, sourceStridesOfr, nullptr);
std::optional<int64_t> sourceStaticSize = dmaOp.getSourceStaticSize();
Expand Down
Loading

0 comments on commit 0757023

Please sign in to comment.