[NpuDmaCpyNdOp/NpuDmaWaitOp] Return optional async token and wait for…

… multiple (nod-ai#827) This PR makes the return type of `AMDAIE::NpuDmaCpyNdOp` optional and more explicit by introducing an `amdaie.async_token` type. The `AMDAIE::NpuDmaWaitOp` will now operate on operands of the async token type. This is consistent with how `gpu.async_token` is used in the GPU dialect.
newling · Oct 7, 2024 · 0757023 · 0757023
1 parent c2fd592
commit 0757023
Show file tree

Hide file tree

Showing 21 changed files with 659 additions and 559 deletions.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
@@ -660,11 +660,12 @@ void LogicalObjectFifoRelease::build(OpBuilder &b, mlir::OperationState &result,
 // Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target and
 // source BD IDs.
 void NpuDmaCpyNdOp::build(
-    OpBuilder &b, OperationState &result, Value dma, Value target,
-    ArrayRef<OpFoldResult> targetOffsets, ArrayRef<OpFoldResult> targetSizes,
-    ArrayRef<OpFoldResult> targetStrides, Value targetBdId, Value source,
-    ArrayRef<OpFoldResult> sourceOffsets, ArrayRef<OpFoldResult> sourceSizes,
-    ArrayRef<OpFoldResult> sourceStrides, Value sourceBdId) {
+    OpBuilder &b, OperationState &result, TypeRange resultTypes,
+    Value connection, Value target, ArrayRef<OpFoldResult> targetOffsets,
+    ArrayRef<OpFoldResult> targetSizes, ArrayRef<OpFoldResult> targetStrides,
+    Value targetBdId, Value source, ArrayRef<OpFoldResult> sourceOffsets,
+    ArrayRef<OpFoldResult> sourceSizes, ArrayRef<OpFoldResult> sourceStrides,
+    Value sourceBdId) {
   SmallVector<int64_t> staticTargetOffsets, staticTargetSizes,
       staticTargetStrides;
   SmallVector<int64_t> staticSourceOffsets, staticSourceSizes,
@@ -685,7 +686,7 @@ void NpuDmaCpyNdOp::build(
                              staticSourceSizes);
   dispatchIndexOpFoldResults(sourceStrides, dynamicSourceStrides,
                              staticSourceStrides);
-  build(b, result, b.getIndexType(), dma, target, dynamicTargetOffsets,
+  build(b, result, resultTypes, connection, target, dynamicTargetOffsets,
         dynamicTargetSizes, dynamicTargetStrides, staticTargetOffsets,
         staticTargetSizes, staticTargetStrides, targetBdId, source,
         dynamicSourceOffsets, dynamicSourceSizes, dynamicSourceStrides,
@@ -695,11 +696,12 @@ void NpuDmaCpyNdOp::build(
 
 // Build a NpuDmaCpyNdOp with static entries.
 void NpuDmaCpyNdOp::build(
-    OpBuilder &b, OperationState &result, Value dma, Value target,
-    ArrayRef<int64_t> targetOffsets, ArrayRef<int64_t> targetSizes,
-    ArrayRef<int64_t> targetStrides, mlir::Value targetBdId, Value source,
-    ArrayRef<int64_t> sourceOffsets, ArrayRef<int64_t> sourceSizes,
-    ArrayRef<int64_t> sourceStrides, mlir::Value sourceBdId) {
+    OpBuilder &b, OperationState &result, TypeRange resultTypes,
+    Value connection, Value target, ArrayRef<int64_t> targetOffsets,
+    ArrayRef<int64_t> targetSizes, ArrayRef<int64_t> targetStrides,
+    mlir::Value targetBdId, Value source, ArrayRef<int64_t> sourceOffsets,
+    ArrayRef<int64_t> sourceSizes, ArrayRef<int64_t> sourceStrides,
+    mlir::Value sourceBdId) {
   SmallVector<OpFoldResult> targetOffsetValues = llvm::to_vector<4>(
       llvm::map_range(targetOffsets, [&](int64_t v) -> OpFoldResult {
         return b.getI64IntegerAttr(v);
@@ -724,18 +726,19 @@ void NpuDmaCpyNdOp::build(
       llvm::map_range(sourceStrides, [&](int64_t v) -> OpFoldResult {
         return b.getI64IntegerAttr(v);
       }));
-  build(b, result, dma, target, targetOffsetValues, targetSizeValues,
-        targetStrideValues, targetBdId, source, sourceOffsetValues,
-        sourceSizeValues, sourceStrideValues, sourceBdId);
+  build(b, result, resultTypes, connection, target, targetOffsetValues,
+        targetSizeValues, targetStrideValues, targetBdId, source,
+        sourceOffsetValues, sourceSizeValues, sourceStrideValues, sourceBdId);
 }
 
 // Build a NpuDmaCpyNdOp with dynamic entries.
-void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma,
-                          Value target, ValueRange targetOffsets,
-                          ValueRange targetSizes, ValueRange targetStrides,
-                          mlir::Value targetBdId, Value source,
-                          ValueRange sourceOffsets, ValueRange sourceSizes,
-                          ValueRange sourceStrides, mlir::Value sourceBdId) {
+void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
+                          TypeRange resultTypes, Value connection, Value target,
+                          ValueRange targetOffsets, ValueRange targetSizes,
+                          ValueRange targetStrides, mlir::Value targetBdId,
+                          Value source, ValueRange sourceOffsets,
+                          ValueRange sourceSizes, ValueRange sourceStrides,
+                          mlir::Value sourceBdId) {
   SmallVector<OpFoldResult> targetOffsetValues =
       llvm::to_vector<4>(llvm::map_range(
           targetOffsets, [](Value v) -> OpFoldResult { return v; }));
@@ -752,13 +755,20 @@ void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma,
   SmallVector<OpFoldResult> sourceStrideValues =
       llvm::to_vector<4>(llvm::map_range(
           sourceStrides, [](Value v) -> OpFoldResult { return v; }));
-  build(b, result, dma, target, targetOffsetValues, targetSizeValues,
-        targetStrideValues, targetBdId, source, sourceOffsetValues,
-        sourceSizeValues, sourceStrideValues, sourceBdId);
+  build(b, result, resultTypes, connection, target, targetOffsetValues,
+        targetSizeValues, targetStrideValues, targetBdId, source,
+        sourceOffsetValues, sourceSizeValues, sourceStrideValues, sourceBdId);
 }
 
 void NpuDmaCpyNdOp::print(OpAsmPrinter &p) {
   Operation *op = getOperation();
+  for (OpResult res : getAsyncTokens()) {
+    if (isa<AMDAIE::AsyncTargetTokenType>(res.getType())) {
+      p << " async_target";
+    } else if (isa<AMDAIE::AsyncSourceTokenType>(res.getType())) {
+      p << " async_source";
+    }
+  }
   p << " " << getConnection() << "(";
   if (getTarget()) p << getTarget();
   printDynamicIndexList(p, op, getTargetOffsets(), getTargetStaticOffsets());
@@ -806,6 +816,16 @@ ParseResult NpuDmaCpyNdOp::parse(OpAsmParser &parser, OperationState &result) {
       sourceDynamicSizes, sourceDynamicStrides;
   SmallVector<Type, 1> targetTypes;
   SmallVector<Type, 1> sourceTypes;
+  SmallVector<Type, 1> asyncTokenTypes;
+
+  if (succeeded(parser.parseOptionalKeyword("async_target"))) {
+    asyncTokenTypes.push_back(
+        parser.getBuilder().getType<AMDAIE::AsyncTargetTokenType>());
+  }
+  if (succeeded(parser.parseOptionalKeyword("async_source"))) {
+    asyncTokenTypes.push_back(
+        parser.getBuilder().getType<AMDAIE::AsyncSourceTokenType>());
+  }
 
   if (failed(parser.parseOperand(dma)) || failed(parser.parseLParen()))
     return failure();
@@ -899,6 +919,8 @@ ParseResult NpuDmaCpyNdOp::parse(OpAsmParser &parser, OperationState &result) {
     }
   }
 
+  result.addTypes(asyncTokenTypes);
+
   llvm::copy(
       ArrayRef<int32_t>({1, static_cast<int32_t>(targetOperands.size()),
                          static_cast<int32_t>(targetDynamicOffsets.size()),
@@ -955,8 +977,6 @@ ParseResult NpuDmaCpyNdOp::parse(OpAsmParser &parser, OperationState &result) {
                                     result.operands))) {
     return failure();
   }
-
-  result.addTypes(indexType);
   return success();
 }
 
@@ -970,9 +990,9 @@ DoublyStridedOpInterface NpuDmaCpyNdOp::createDoublyStridedOp(
     ::llvm::SmallVector<OpFoldResult> &newSourceStrides) {
   Location loc = (*this)->getLoc();
   auto newOp = rewriter.create<AMDAIE::NpuDmaCpyNdOp>(
-      loc, getConnection(), getTarget(), newTargetOffsets, newTargetSizes,
-      newTargetStrides, getTargetBdId(), getSource(), newSourceOffsets,
-      newSourceSizes, newSourceStrides, getSourceBdId());
+      loc, getResultTypes(), getConnection(), getTarget(), newTargetOffsets,
+      newTargetSizes, newTargetStrides, getTargetBdId(), getSource(),
+      newSourceOffsets, newSourceSizes, newSourceStrides, getSourceBdId());
   return cast<DoublyStridedOpInterface>(newOp.getOperation());
 }
 
@@ -991,8 +1011,8 @@ struct NpuDmaCpyNdOpReplacementBuilder {
                       ArrayRef<OpFoldResult> srcMixedSizes,
                       ArrayRef<OpFoldResult> srcMixedStrides) {
     rewriter.replaceOpWithNewOp<NpuDmaCpyNdOp>(
-        dmaOp, dmaOp.getConnection(), dmaOp.getTarget(), tgtMixedOffsets,
-        tgtMixedSizes, tgtMixedStrides, dmaOp.getTargetBdId(),
+        dmaOp, dmaOp.getResultTypes(), dmaOp.getConnection(), dmaOp.getTarget(),
+        tgtMixedOffsets, tgtMixedSizes, tgtMixedStrides, dmaOp.getTargetBdId(),
         dmaOp.getSource(), srcMixedOffsets, srcMixedSizes, srcMixedStrides,
         dmaOp.getSourceBdId());
   }
@@ -1150,6 +1170,21 @@ void NpuCircularDmaCpyNdOp::getCanonicalizationPatterns(
       context);
 }
 
+//===----------------------------------------------------------------------===//
+// AMDAIE_NpuDmaWaitOp
+//===----------------------------------------------------------------------===//
+
+SmallVector<AMDAIE::NpuDmaCpyNdOp> NpuDmaWaitOp::getDmaOps() {
+  SmallVector<AMDAIE::NpuDmaCpyNdOp> dmaOps;  // In getAsyncTokens() order.
+  for (Value token : getAsyncTokens()) {  // Non-DMA producers are skipped.
+    if (auto dmaOp =
+            dyn_cast_if_present<AMDAIE::NpuDmaCpyNdOp>(token.getDefiningOp())) {
+      dmaOps.push_back(dmaOp);  // Not deduplicated: once per matching token.
+    }
+  }
+  return dmaOps;
+}
+
 //===----------------------------------------------------------------------===//
 // AMDAIE_TileOp
 //===----------------------------------------------------------------------===//

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -379,8 +379,7 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [
 //===----------------------------------------------------------------------===//
 
 def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
-      AttrSizedOperandSegments, DoublyStridedOpInterface]>,
-    Results<(outs Index)> {
+      AttrSizedOperandSegments, DoublyStridedOpInterface]> {
   let summary = "The Npu uController's dma operator";
   let description = [{
     The Npu DMA operation represents a strided copy operation with an unlimited
@@ -393,6 +392,16 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
     source and target `offsets`, `sizes` and `strides`. A special sentinel value
     ShapedType::kDynamic encodes that the corresponding entry has a dynamic value.
 
+    The op can be `async` by specifying one or more `AnyAsyncTokenType` results.
+    Other wait-like operations can use these async tokens to describe a blocking
+    operation on the source and/or target side of this DMA operation.
+    In case of `async_source`, this operation returns an async token which can
+    be used to block until the source DMA port is done executing.
+    In case of `async_target`, this operation returns an async token which can
+    be used to block until the target DMA port is done executing.
+    In case of both `async_source` and `async_target`, two tokens will be returned
+    which can be blocked on.
+
     Example:
 
     ```mlir
@@ -401,7 +410,7 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
       !amdaie.logicalobjectfifo<memref<32x1024xi32>>)
     ...
     amdaie.controlcode {
-      %3 = amdaie.npu.dma_cpy_nd %2([] [] [], [0, 0] [32, 64] [1024, 1])
+      %3 = amdaie.npu.dma_cpy_nd async_source %2([] [] [], [0, 0] [32, 64] [1024, 1])
       ...
     }
     ```
@@ -427,33 +436,35 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
         Optional<Index>:$source_bd_id
   );
 
+  let results = (outs Variadic<AMDAIE_AnyAsyncTokenType>:$async_tokens);
+
   // Use a custom assembly format because of weird spaces being inserted around 
   // the optional `target` by the default assembly format generator.
   let hasCustomAssemblyFormat = 1;
 
   let builders = [
     // Build a NpuDmaCpyNdOp with mixed static and dynamic entries.
-    OpBuilder<(ins "Value":$dma, "::mlir::Value":$target,
-      "ArrayRef<OpFoldResult>":$target_offsets,
+    OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
+      "::mlir::Value":$target, "ArrayRef<OpFoldResult>":$target_offsets,
       "ArrayRef<OpFoldResult>":$target_sizes,
       "ArrayRef<OpFoldResult>":$target_strides, "::mlir::Value":$target_bd_id,
       "::mlir::Value":$source, "ArrayRef<OpFoldResult>":$source_offsets,
       "ArrayRef<OpFoldResult>":$source_sizes,
       "ArrayRef<OpFoldResult>":$source_strides, "::mlir::Value":$source_bd_id)>,
     // Build a NpuDmaCpyNdOp with static entries.
-    OpBuilder<(ins "Value":$dma, "::mlir::Value":$target,
-      "ArrayRef<int64_t>":$target_offsets, "ArrayRef<int64_t>":$target_sizes,
-      "ArrayRef<int64_t>":$target_strides, "::mlir::Value":$target_bd_id,
-      "::mlir::Value":$source, "ArrayRef<int64_t>":$source_offsets, 
-      "ArrayRef<int64_t>":$source_sizes, "ArrayRef<int64_t>":$source_strides,
-      "::mlir::Value":$source_bd_id)>,
+    OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
+      "::mlir::Value":$target, "ArrayRef<int64_t>":$target_offsets, 
+      "ArrayRef<int64_t>":$target_sizes, "ArrayRef<int64_t>":$target_strides,
+      "::mlir::Value":$target_bd_id, "::mlir::Value":$source,
+      "ArrayRef<int64_t>":$source_offsets, "ArrayRef<int64_t>":$source_sizes,
+      "ArrayRef<int64_t>":$source_strides, "::mlir::Value":$source_bd_id)>,
     // Build a NpuDmaCpyNdOp with dynamic entries.
-    OpBuilder<(ins "Value":$dma, "::mlir::Value":$target,
-      "ValueRange":$target_offsets, "ValueRange":$target_sizes,
-      "ValueRange":$target_strides, "::mlir::Value":$target_bd_id,
-      "::mlir::Value":$source, "ValueRange":$source_offsets,
-      "ValueRange":$source_sizes, "ValueRange":$source_strides, 
-      "::mlir::Value":$source_bd_id)>
+    OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$dma,
+      "::mlir::Value":$target, "ValueRange":$target_offsets,
+      "ValueRange":$target_sizes, "ValueRange":$target_strides,
+      "::mlir::Value":$target_bd_id, "::mlir::Value":$source,
+      "ValueRange":$source_offsets, "ValueRange":$source_sizes,
+      "ValueRange":$source_strides, "::mlir::Value":$source_bd_id)>
   ];
 
   let extraClassDeclaration = [{
@@ -702,45 +713,51 @@ def AMDAIE_NpuCircularDmaCpyNdOp: AMDAIE_Op<"npu.circular_dma_cpy_nd", [
 def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> {
   let summary = "Wait for the Npu DMA operation to complete.";
   let description = [{
-    The wait operation will block on the referenced Npu DMA operation to complete
-    execution on the provided `direction`. The `S2MM` direction will block on the
-    destination side of the dma operation, ensuring complete execution. The
-    `MM2S` direction will block on the source side of the dma operation,
-    ensuring that the DMA has successfully started execution, but not
-    guaranteeing that all data has been received on the destination side.
+    The wait operation will block on the referenced dependent ops.
+
+    If a dependent op returns a `!amdaie.async_token`, this wait op will block
+    on the dependent op having completed execution.
+    If a dependent op returns a `!amdaie.async_source_token`, this wait op will
+    block on the source side of the referenced dependent op having completed
+    execution.
+    If a dependent op returns a `!amdaie.async_target_token`, this wait op will
+    block on the target side of the referenced dependent op having completed
+    execution.
+
+    Being able to block on the source and/or target side separately is useful
+    for copy/dma-like operations that involve multiple physical ports/channels
+    in hardware. In this case, blocking on the source side and/or target side
+    might be different from blocking on the entire operation.
 
     Example:
 
     ```mlir
-    %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [%c0, %c0] [%c32, %c64] [%c1024, %c1])
-    amdaie.npu.dma_wait(%2, MM2S)
+    %2 = amdaie.npu.dma_cpy_nd async_source %0([] [] [], [%c0, %c0] [%c32, %c64] [%c1024, %c1])
+    amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
     ```
 
     Here, the `dma_wait` operation will wait until the referenced Npu DMA
-    operation has started execution. On the other hand, the `S2MM` direction can
-    be used to wait on the destination side of the DMA, i.e. until the DMA has
-    finished its write into the target memory:
+    operation has completed execution on its source side. On the other hand,
+    the `!amdaie.async_target_token` can be used to wait on the target side of
+    the DMA, i.e. until the DMA has finished its write into the target memory:
 
     ```mlir
-    %2 = amdaie.npu.dma_cpy_nd %0([%c0, %c0] [%c32, %c64] [%c1024, %c1], [] [] [])
-    amdaie.npu.dma_wait(%2, S2MM)
+    %2 = amdaie.npu.dma_cpy_nd async_target %0([%c0, %c0] [%c32, %c64] [%c1024, %c1], [] [] [])
+    amdaie.npu.dma_wait(%2 : !amdaie.async_target_token)
     ```
   }];
 
   let arguments = (
-    ins Index:$dma,
-        DMAChannelDir:$direction
+    ins Variadic<AMDAIE_AnyAsyncTokenType>:$async_tokens
   );
 
   let assemblyFormat = [{
-    `(` $dma `,` $direction `)`  attr-dict
+    (`(` $async_tokens^ `:` type($async_tokens) `)`)? attr-dict
   }];
 
   let extraClassDeclaration = [{
-    // Return the Npu DMA operation argument.
-    NpuDmaCpyNdOp getDmaOp() { 
-      return dyn_cast_if_present<NpuDmaCpyNdOp>(getDma().getDefiningOp());
-    }
+    // Return the Npu DMA operation arguments.
+    SmallVector<NpuDmaCpyNdOp> getDmaOps();
   }];
 }
 
@@ -1073,7 +1090,7 @@ def AMDAIE_LogicalObjectFifoPlaceholderOp:
     amdaie.controlcode {
       %obj1 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0}
         : memref<1024xi32> -> !amdaie.logicalobjectfifo<memref<1024xi32>>
-      %npu_dma = amdaie.npu.dma_cpy_nd %connection([] [] [], 
+      %npu_dma = amdaie.npu.dma_cpy_nd async_source %connection([] [] [], 
         %obj0[%c0, %c32] [%c32, %c32] [%c32, %c1]) 
         : source_type = !amdaie.logicalobjectfifo<memref<1024xi32>>
       amdaie.end

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIETypes.td
@@ -15,6 +15,16 @@ class AMDAIEDialect_Type<string name, string typeMnemonic, list<Trait> traits =
   let mnemonic = typeMnemonic;
 }
 
+// The types for async tokens which can be returned from async operations. The
+// dedicated types for `source` and `target` can be used to specify which side
+// of an operation (for example a copy/DMA) should be synchronized if there
+// are multiple.
+def AMDAIE_AsyncTokenType : AMDAIEDialect_Type<"AsyncToken", "async_token">;
+def AMDAIE_AsyncSourceTokenType : AMDAIEDialect_Type<"AsyncSourceToken", "async_source_token">;
+def AMDAIE_AsyncTargetTokenType : AMDAIEDialect_Type<"AsyncTargetToken", "async_target_token">;
+def AMDAIE_AnyAsyncTokenType 
+  : AnyTypeOf<[AMDAIE_AsyncTokenType, AMDAIE_AsyncSourceTokenType, AMDAIE_AsyncTargetTokenType]>;
+
 def AMDAIE_LogicalObjectFifoType :
     AMDAIEDialect_Type<"LogicalObjectFifo", "logicalobjectfifo"> {
   let summary = "The logical objectfifo type encapsulating a memref";

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/AMDAIEDmaOpInterfaceTest.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/AMDAIEDmaOpInterfaceTest.cpp
@@ -46,7 +46,7 @@ class DmaOpInterfaceTest : public ::testing::Test {
     auto input =
         rewriter.create<arith::ConstantIndexOp>(rewriter.getUnknownLoc(), 2);
     auto dmaOp = rewriter.create<mlir::iree_compiler::AMDAIE::NpuDmaCpyNdOp>(
-        rewriter.getUnknownLoc(), input, target, targetOffsetsOfr,
+        rewriter.getUnknownLoc(), TypeRange{}, input, target, targetOffsetsOfr,
         targetSizesOfr, targetStridesOfr, nullptr, source, sourceOffsetsOfr,
         sourceSizesOfr, sourceStridesOfr, nullptr);
     std::optional<int64_t> sourceStaticSize = dmaOp.getSourceStaticSize();