From 7ed306d937c40469e64ab8d21d43ba489b29ed4c Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 25 Sep 2024 15:00:18 -0600 Subject: [PATCH 01/37] Zero Padding python binding --- include/aie/Dialect/AIE/IR/AIEOps.td | 6 ++++-- python/dialects/aie.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index 59f7731d49..426441cffd 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -1678,7 +1678,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] // via_shared_mem==1 means use consumer tile's memory module OptionalAttr:$via_shared_mem, // memtile_repeat==0 means "do it once" and don't repeat - OptionalAttr:$memtile_repeat + OptionalAttr:$memtile_repeat, + OptionalAttr:$pad_dimensions ); let assemblyFormat = [{ @@ -1717,7 +1718,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] OpBuilder<(ins "mlir::StringAttr":$sym_name, "mlir::Value":$producerTile, "mlir::ValueRange":$consumerTiles, "mlir::Attribute":$elemNumber, "mlir::Type":$elem_type, CArg<"llvm::ArrayRef", "{}">:$dimensionsToStream, - CArg<"llvm::ArrayRef", "{}">:$dimensionsFromStreamPerConsumer), [{ + CArg<"llvm::ArrayRef", "{}">:$dimensionsFromStreamPerConsumer, + CArg<"llvm::ArrayRef", "{}">:$pad_dimensions), [{ odsState.addOperands(producerTile); odsState.addOperands(consumerTiles); odsState.addAttribute(getSymNameAttrName(odsState.name), sym_name); diff --git a/python/dialects/aie.py b/python/dialects/aie.py index 594a4b1e54..556afef7c7 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -104,6 +104,8 @@ def __init__(self, calleeOrResults, inputs=[], input_types=[]): def bd_dim_layout(size, stride): return Attribute.parse(f"#aie.bd_dim_layout<{size=}, {stride=}>") +def bd_pad_layout(const_pad_before, const_pad_after): + return Attribute.parse(f"#aie.bd_pad_layout<{const_pad_before=}, {const_pad_after=}>") @register_attribute_builder("BDDimLayoutArrayAttr") def bd_dim_layout_array_attr_builder( @@ -123,6 +125,16 @@ def bd_dim_layout_array_array_attr_builder(tup_arrs: List[List[tuple]], context= f'#aie', context=context, ) + +@register_attribute_builder("BDPadLayoutArrayAttr") +def bd_pad_layout_array_attr_builder( + tups: List[Union[Attribute, Tuple[int]]], context=None +): + if isinstance(tups, list) and all(isinstance(t, tuple) for t in tups): + tups = list(map(lambda t: bd_pad_layout(*t), tups)) + return Attribute.parse( + f'#aie', context=context + ) @register_attribute_builder("AIEI1Attr") @@ -279,6 +291,7 @@ def __init__( dimensionsFromStreamPerConsumer=None, via_DMA=None, plio=None, + pad_dimensions=None, ): self.datatype = datatype if not isinstance(consumerTiles, List): @@ -287,6 +300,8 @@ def __init__( dimensionsFromStreamPerConsumer = [] if dimensionsToStream is None: dimensionsToStream = [] + if pad_dimensions is None: + pad_dimensions = [] int_ty = IntegerType.get_signless(32) of_Ty = TypeAttr.get(ObjectFifoType.get(datatype)) super().__init__( @@ -299,6 +314,7 @@ def __init__( dimensionsFromStreamPerConsumer=dimensionsFromStreamPerConsumer, via_DMA=via_DMA, plio=plio, + pad_dimensions=pad_dimensions ) def acquire(self, port, num_elem): From 266b50ba2e41ff6e9ca60a2b0db244ec8c91f1b9 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 25 Sep 2024 15:02:47 -0600 Subject: [PATCH 02/37] Padding at python level --- .../memtile_repeat/distribute_repeat/aie2.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py index 602aa75652..e95ba90d9d 100644 --- a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py @@ -10,9 +10,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * +from aie.dialects.scf import * from aie.extras.dialects.ext import arith from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext.scf import _for as range_ dev = AIEDevice.npu1_1col col = 0 @@ -54,7 +54,7 @@ def device_body(): # AIE-array data movement with object fifos of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_in_ty) - of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_half_ty) + of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_half_ty, pad_dimensions=[(1,0),(1,1),(1,1)]) of_in3 = object_fifo("in3", MemTile, ComputeTile3, 2, memRef_half_ty) of_in2.set_memtile_repeat(repeat_counter) of_in3.set_memtile_repeat(repeat_counter) @@ -70,28 +70,32 @@ def device_body(): # Compute tile 2 @core(ComputeTile2) def core_body(): - for _ in range_(sys.maxsize): + for _ in for_(sys.maxsize): elemOut = of_out2.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in2.acquire(ObjectFifoPort.Consume, 1) - for i in range_(N // 2): + for i in for_(N // 2): v0 = memref.load(elemIn, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elemOut, [i]) + yield_([]) of_in2.release(ObjectFifoPort.Consume, 1) of_out2.release(ObjectFifoPort.Produce, 1) + yield_([]) # Compute tile 3 @core(ComputeTile3) def core_body(): - for _ in range_(sys.maxsize): + for _ in for_(sys.maxsize): elemOut = of_out3.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in3.acquire(ObjectFifoPort.Consume, 1) - for i in range_(N // 2): + for i in for_(N // 2): v0 = memref.load(elemIn, [i]) v1 = arith.addi(v0, arith.constant(2, T.i32())) memref.store(v1, elemOut, [i]) + yield_([]) of_in3.release(ObjectFifoPort.Consume, 1) of_out3.release(ObjectFifoPort.Produce, 1) + yield_([]) # To/from AIE-array data movement tensor_out_ty = T.memref(out_size, T.i32()) From 47dfc0a923d9ada3e56cb73f12abd6ba53cdcf85 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 25 Sep 2024 15:04:00 -0600 Subject: [PATCH 03/37] Connecting padding from fifo to dmabd --- .../AIEObjectFifoStatefulTransform.cpp | 59 ++++++++++++------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 388f14a8f6..2d11bc057e 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -456,13 +456,17 @@ struct AIEObjectFifoStatefulTransformPass void createBd(OpBuilder &builder, LockOp acqLock, int acqMode, LockAction acqLockAction, LockOp relLock, int relMode, MyOp buff, int offset, int len, Block *succ, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { builder.create(builder.getUnknownLoc(), acqLock, acqLockAction, acqMode); - if (!dims.getValue().empty()) - builder.create(builder.getUnknownLoc(), buff, offset, len, dims); - else - builder.create(builder.getUnknownLoc(), buff, offset, len); + if (!dims.getValue().empty()){ + builder.create(builder.getUnknownLoc(), buff, offset, len, dims, padDimensions); + std::cout<<"With pads dmabd"<(builder.getUnknownLoc(), buff, offset, len, dims, padDimensions); + dmas.dump(); + std::cout<<"Without pads dmabd"<(builder.getUnknownLoc(), relLock, LockAction::Release, relMode); @@ -476,7 +480,7 @@ struct AIEObjectFifoStatefulTransformPass void createBdBlock(OpBuilder &builder, ObjectFifoCreateOp op, int lockMode, int acqNum, int relNum, MyOp buff, int offset, int len, DMAChannelDir channelDir, size_t blockIndex, Block *succ, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { LockOp acqLock; LockOp relLock; int acqMode = 1; @@ -498,22 +502,36 @@ struct AIEObjectFifoStatefulTransformPass relLock = channelDir == DMAChannelDir::S2MM ? locksPerFifo[op][1] : locksPerFifo[op][0]; } + std::cout<<"BD Block"<(builder, target, lockMode, acqNum, relNum, buffersPerFifo[target][blockIndex], /*offset*/ 0, - len, channelDir, blockIndex, succ, dims); + len, channelDir, blockIndex, succ, dims, nullptr); curr = succ; blockIndex++; } @@ -672,7 +690,7 @@ struct AIEObjectFifoStatefulTransformPass createBdBlock(builder, op, lockMode, acqNum, relNum, externalBuffersPerFifo[op][blockIndex], /*offset*/ 0, len, channelDir, blockIndex, - succ, dims); + succ, dims, nullptr); curr = succ; blockIndex++; } @@ -683,7 +701,7 @@ struct AIEObjectFifoStatefulTransformPass void createMemTileDMA(DeviceOp &device, OpBuilder &builder, ObjectFifoCreateOp op, DMAChannelDir channelDir, int channelIndex, int lockMode, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { size_t numBlocks = op.size(); if (numBlocks == 0) return; @@ -704,6 +722,7 @@ struct AIEObjectFifoStatefulTransformPass dims.getValue().drop_front(1)); } } + if (op.getMemtileRepeat().has_value()) repeatCount = op.getMemtileRepeat().value(); @@ -831,9 +850,10 @@ struct AIEObjectFifoStatefulTransformPass int offset = 0; if (isDistribute || isJoin) offset = extraOffset; - createBdBlock(builder, target, lockMode, acqNum, relNum, + std::cout<<"MEmTile DMA"<(builder, target, lockMode, acqNum, relNum, buffersPerFifo[target][blockIndex], offset, - lenOut, channelDir, blockIndex, succ, dims); + lenOut, channelDir, blockIndex, succ, dims, padDimensions); curr = succ; blockIndex++; } @@ -1129,7 +1149,6 @@ struct AIEObjectFifoStatefulTransformPass auto consumerWireType = WireBundle::DMA; std::set objectFifoTiles; // track cores to check for loops during unrolling - //===------------------------------------------------------------------===// // Split objectFifos into a consumer end and producer end if needed //===------------------------------------------------------------------===// @@ -1268,7 +1287,7 @@ struct AIEObjectFifoStatefulTransformPass DMAChannel producerChan = dmaAnalysis.getMasterDMAChannel(producer.getProducerTile()); createDMA(device, builder, producer, producerChan.direction, - producerChan.channel, 0, producer.getDimensionsToStreamAttr()); + producerChan.channel, 0, producer.getDimensionsToStreamAttr(), nullptr); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); @@ -1286,7 +1305,7 @@ struct AIEObjectFifoStatefulTransformPass BDDimLayoutArrayAttr consumerDims = consumer.getDimensionsFromStreamPerConsumer()[0]; createDMA(device, builder, consumer, consumerChan.direction, - consumerChan.channel, 1, consumerDims); + consumerChan.channel, 1, consumerDims, nullptr); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); @@ -1535,4 +1554,4 @@ struct AIEObjectFifoStatefulTransformPass std::unique_ptr> AIE::createAIEObjectFifoStatefulTransformPass() { return std::make_unique(); -} +} \ No newline at end of file From 221507e2efe8de8bb9a47308d0534dfdb99976c5 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 25 Sep 2024 17:15:05 -0600 Subject: [PATCH 04/37] Padding --- .../AIEObjectFifoStatefulTransform.cpp | 22 ++++--------------- .../memtile_repeat/distribute_repeat/aie2.py | 2 +- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 2d11bc057e..60d24026af 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -461,11 +461,8 @@ struct AIEObjectFifoStatefulTransformPass acqMode); if (!dims.getValue().empty()){ builder.create(builder.getUnknownLoc(), buff, offset, len, dims, padDimensions); - std::cout<<"With pads dmabd"<(builder.getUnknownLoc(), buff, offset, len, dims, padDimensions); - dmas.dump(); - std::cout<<"Without pads dmabd"<(builder.getUnknownLoc(), buff, offset, len); } builder.create(builder.getUnknownLoc(), relLock, @@ -502,7 +499,6 @@ struct AIEObjectFifoStatefulTransformPass relLock = channelDir == DMAChannelDir::S2MM ? locksPerFifo[op][1] : locksPerFifo[op][0]; } - std::cout<<"BD Block"<(builder, target, lockMode, acqNum, relNum, buffersPerFifo[target][blockIndex], offset, lenOut, channelDir, blockIndex, succ, dims, padDimensions); @@ -1287,7 +1273,7 @@ struct AIEObjectFifoStatefulTransformPass DMAChannel producerChan = dmaAnalysis.getMasterDMAChannel(producer.getProducerTile()); createDMA(device, builder, producer, producerChan.direction, - producerChan.channel, 0, producer.getDimensionsToStreamAttr(), nullptr); + producerChan.channel, 0, producer.getDimensionsToStreamAttr(), producer.getPadDimensionsAttr()); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); @@ -1305,7 +1291,7 @@ struct AIEObjectFifoStatefulTransformPass BDDimLayoutArrayAttr consumerDims = consumer.getDimensionsFromStreamPerConsumer()[0]; createDMA(device, builder, consumer, consumerChan.direction, - consumerChan.channel, 1, consumerDims, nullptr); + consumerChan.channel, 1, consumerDims, consumer.getPadDimensionsAttr()); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py index e95ba90d9d..ea3903f2b9 100644 --- a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py @@ -54,7 +54,7 @@ def device_body(): # AIE-array data movement with object fifos of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_in_ty) - of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_half_ty, pad_dimensions=[(1,0),(1,1),(1,1)]) + of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_half_ty, dimensionsToStream=[(16,2)], pad_dimensions=[(2,0)]) of_in3 = object_fifo("in3", MemTile, ComputeTile3, 2, memRef_half_ty) of_in2.set_memtile_repeat(repeat_counter) of_in3.set_memtile_repeat(repeat_counter) From d7585b8d31337ca67745384a15b4a9b43d702c79 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 26 Sep 2024 10:11:13 -0600 Subject: [PATCH 05/37] Slight adjustment --- .../Transforms/AIEObjectFifoStatefulTransform.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 60d24026af..0a5e465e2e 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -459,9 +459,15 @@ struct AIEObjectFifoStatefulTransformPass BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { builder.create(builder.getUnknownLoc(), acqLock, acqLockAction, acqMode); - if (!dims.getValue().empty()){ - builder.create(builder.getUnknownLoc(), buff, offset, len, dims, padDimensions); - } else{ + + if (!dims.getValue().empty() && !padDimensions.getValue().empty()){ + auto dmas = builder.create(builder.getUnknownLoc(), buff, offset, len, dims, padDimensions); + dmas.dump(); + } + else if (!dims.getValue().empty()){ + builder.create(builder.getUnknownLoc(), buff, offset, len, dims); + } + else{ builder.create(builder.getUnknownLoc(), buff, offset, len); } @@ -512,7 +518,7 @@ struct AIEObjectFifoStatefulTransformPass createShimDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); } else if (op.getProducerTileOp().isMemTile() && channelDir == DMAChannelDir::MM2S) { - createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, + createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, pad_dims); } else if (op.getProducerTileOp().isMemTile() && channelDir == DMAChannelDir::S2MM){ createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, From ae7dc3cd3e24c9565fa5be42dfb6a21f265adc3b Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 26 Sep 2024 10:16:44 -0600 Subject: [PATCH 06/37] Test cases --- .../basic/memtile_repeat/distribute_repeat/aie2.py | 2 +- programming_examples/basic/passthrough_dmas/aie2.py | 12 +++--------- programming_examples/basic/passthrough_dmas/test.cpp | 1 + 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py index ea3903f2b9..5ec5dd3ba2 100644 --- a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py @@ -55,7 +55,7 @@ def device_body(): # AIE-array data movement with object fifos of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_in_ty) of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_half_ty, dimensionsToStream=[(16,2)], pad_dimensions=[(2,0)]) - of_in3 = object_fifo("in3", MemTile, ComputeTile3, 2, memRef_half_ty) + of_in3 = object_fifo("in3", MemTile, ComputeTile3, 2, memRef_half_ty, dimensionsToStream=[(10,2)], pad_dimensions=[(2,2)]) of_in2.set_memtile_repeat(repeat_counter) of_in3.set_memtile_repeat(repeat_counter) object_fifo_link(of_in, [of_in2, of_in3], [], [0, N // 2]) diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py index 4bd9266c2c..7837411804 100644 --- a/programming_examples/basic/passthrough_dmas/aie2.py +++ b/programming_examples/basic/passthrough_dmas/aie2.py @@ -41,21 +41,15 @@ def device_body(): # Tile declarations ShimTile = tile(col, 0) - ComputeTile2 = tile(col, 2) + MemTile = tile(col, 1) # AIE-array data movement with object fifos - of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty) - of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty) + of_in = object_fifo("in", ShimTile, MemTile, 2, memRef_ty) + of_out = object_fifo("out", MemTile, ShimTile, 2, memRef_ty, dimensionsToStream=[(16,2)], pad_dimensions=[(2,0)]) object_fifo_link(of_in, of_out) # Set up compute tiles - # Compute tile 2 - @core(ComputeTile2) - def core_body(): - for _ in range_(sys.maxsize): - pass - # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/passthrough_dmas/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp index 9c11596119..bdb360da81 100644 --- a/programming_examples/basic/passthrough_dmas/test.cpp +++ b/programming_examples/basic/passthrough_dmas/test.cpp @@ -179,6 +179,7 @@ int main(int argc, const char *argv[]) { uint32_t ref = (i + 1); if (*(bufOut + i) != ref) { errors++; + std::cout<<*(bufOut+i)<<", "< Date: Tue, 1 Oct 2024 15:27:11 -0600 Subject: [PATCH 07/37] Revert changes --- .../basic/memtile_repeat/distribute_repeat/aie2.py | 13 ++++++------- .../basic/passthrough_dmas/aie2.py | 14 ++++++++++---- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py index 9d5bc1104c..268a0a6824 100644 --- a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py @@ -11,6 +11,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ dev = AIEDevice.npu1_1col col = 0 @@ -52,8 +53,8 @@ def device_body(): # AIE-array data movement with object fifos of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_in_ty) - of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_half_ty, dimensionsToStream=[(16,2)], pad_dimensions=[(2,0)]) - of_in3 = object_fifo("in3", MemTile, ComputeTile3, 2, memRef_half_ty, dimensionsToStream=[(10,2)], pad_dimensions=[(2,2)]) + of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_half_ty) + of_in3 = object_fifo("in3", MemTile, ComputeTile3, 2, memRef_half_ty) of_in2.set_memtile_repeat(repeat_counter) of_in3.set_memtile_repeat(repeat_counter) object_fifo_link(of_in, [of_in2, of_in3], [], [0, N // 2]) @@ -68,26 +69,24 @@ def device_body(): # Compute tile 2 @core(ComputeTile2) def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemOut = of_out2.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in2.acquire(ObjectFifoPort.Consume, 1) for i in range_(N // 2): elemOut[i] = elemIn[i] + 1 of_in2.release(ObjectFifoPort.Consume, 1) of_out2.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 3 @core(ComputeTile3) def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemOut = of_out3.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in3.acquire(ObjectFifoPort.Consume, 1) for i in range_(N // 2): elemOut[i] = elemIn[i] + 2 of_in3.release(ObjectFifoPort.Consume, 1) of_out3.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement tensor_out_ty = T.memref(out_size, T.i32()) @@ -105,4 +104,4 @@ def sequence(A, B, C): print(ctx.module) -distribute_repeat() +distribute_repeat() \ No newline at end of file diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py index c139b8284a..b1fca34482 100644 --- a/programming_examples/basic/passthrough_dmas/aie2.py +++ b/programming_examples/basic/passthrough_dmas/aie2.py @@ -41,15 +41,21 @@ def device_body(): # Tile declarations ShimTile = tile(col, 0) - MemTile = tile(col, 1) + ComputeTile2 = tile(col, 2) # AIE-array data movement with object fifos - of_in = object_fifo("in", ShimTile, MemTile, 2, memRef_ty) - of_out = object_fifo("out", MemTile, ShimTile, 2, memRef_ty, dimensionsToStream=[(16,2)], padDimensions=[(2,0)]) + of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty) object_fifo_link(of_in, of_out) # Set up compute tiles + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + for _ in range_(sys.maxsize): + pass + # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) @@ -64,4 +70,4 @@ def sequence(A, B, C): print(ctx.module) -my_passthrough() +my_passthrough() \ No newline at end of file From 5b39c9ea07b75f6216fdea7047152268710847b6 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Tue, 1 Oct 2024 15:48:46 -0600 Subject: [PATCH 08/37] Runtime zero padding on MemTile --- include/aie/Dialect/AIEX/IR/AIEX.td | 24 +++++++++++-- .../AIEX/Transforms/AIECtrlPacketToDma.cpp | 2 +- .../AIEX/Transforms/AIEDMATasksToNPU.cpp | 4 ++- lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 34 ++++++++++++++++++- 4 files changed, 58 insertions(+), 6 deletions(-) diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index ce04f131e0..489bef7695 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -570,7 +570,13 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ OptionalAttr:$packet, FlatSymbolRefAttr:$metadata, I64Attr:$id, - DefaultValuedOptionalAttr:$issue_token + DefaultValuedOptionalAttr:$issue_token, + DefaultValuedOptionalAttr:$d0_zero_before, + DefaultValuedOptionalAttr:$d1_zero_before, + DefaultValuedOptionalAttr:$d2_zero_before, + DefaultValuedOptionalAttr:$d0_zero_after, + DefaultValuedOptionalAttr:$d1_zero_after, + DefaultValuedOptionalAttr:$d2_zero_after ); let assemblyFormat = [{ @@ -840,7 +846,13 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd", []> { I32Attr:$lock_rel_id, I32Attr:$lock_acq_enable, I32Attr:$lock_acq_val, - I32Attr:$lock_acq_id + I32Attr:$lock_acq_id, + I32Attr:$d0_zero_before, + I32Attr:$d1_zero_before, + I32Attr:$d2_zero_before, + I32Attr:$d0_zero_after, + I32Attr:$d1_zero_after, + I32Attr:$d2_zero_after ); let results = (outs ); let assemblyFormat = [{ attr-dict }]; @@ -865,7 +877,13 @@ def AIE_DMAConfigureTaskOp : AIEX_Op<"dma_configure_task", [HasParent<"RuntimeSe DMAChannelDir:$direction, I32Attr:$channel, DefaultValuedOptionalAttr:$issue_token, - DefaultValuedOptionalAttr:$repeat_count + DefaultValuedOptionalAttr:$repeat_count, + DefaultValuedOptionalAttr:$d0_zero_before, + DefaultValuedOptionalAttr:$d1_zero_before, + DefaultValuedOptionalAttr:$d2_zero_before, + DefaultValuedOptionalAttr:$d0_zero_after, + DefaultValuedOptionalAttr:$d1_zero_after, + DefaultValuedOptionalAttr:$d2_zero_after ); let regions = ( diff --git a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp index f56f1cee3e..c064ad6702 100644 --- a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp +++ b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp @@ -129,7 +129,7 @@ struct AIECtrlPacketToDmaPass : AIECtrlPacketToDmaBase { SmallVector{}, SmallVector{}, SmallVector{}, ArrayRef(staticOffsets), ArrayRef(staticSizes), ArrayRef(staticStrides), - controllerIdPkt, metadata, 0, true); + controllerIdPkt, metadata, 0, true, 0, 0, 0, 0, 0, 0); auto shimRow = builder.getI32IntegerAttr(0); auto shimCol = builder.getI32IntegerAttr(col); diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 8c889553da..5e3d6dc3b1 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -316,7 +316,9 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { /*valid_bd=*/1, /* TODO: Locks */ /*lock_rel_val=*/0, /*lock_rel_id=*/0, /*lock_acq_enable=*/0, - /*lock_acq_val=*/0, /*lock_ackq_id=*/0); + /*lock_acq_val=*/0, /*lock_ackq_id=*/0, /*d0_zero_before=*/0, + /*d1_zero_before=*/0, /*d2_zero_before=*/0, /*d0_zero_after=*/0, + /*d1_zero_after=*/0, /*d2_zero_after=*/0); return setAddressForSingleBD(builder, bd_op, tile); } diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index b9390d6e59..5978f77b47 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -338,6 +338,12 @@ struct DmaToNpuPattern : OpConversionPattern { auto lock_acq_enable = zero; auto lock_acq_val = zero; auto lock_acq_id = zero; + auto d0_zero_before = zero; + auto d1_zero_before = zero; + auto d2_zero_before = zero; + auto d0_zero_after = zero; + auto d1_zero_after = zero; + auto d2_zero_after = zero; auto issue_token = BoolAttr::get(ctx, false); auto repeat_count = zero; @@ -448,6 +454,24 @@ struct DmaToNpuPattern : OpConversionPattern { // lock_acq_id + // d0_zero_before + d0_zero_before = IntegerAttr::get(i32ty, op.getD0ZeroBefore()); + + // d1_zero_before + d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore()); + + // d2_zero_before + d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore()); + + // d0_zero_after + d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter()); + + // d1_zero_after + d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter()); + + // d2_zero_after + d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter()); + // Set the issue_token issue_token = BoolAttr::get(ctx, op.getIssueToken()); // Earlier, all S2MM channels were implicitly assumed to issue a token. @@ -460,7 +484,9 @@ struct DmaToNpuPattern : OpConversionPattern { enable_packet, out_of_order_id, packet_id, packet_type, d0_size, d0_stride, d1_size, d1_stride, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, - lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id); + lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id, + d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, + d1_zero_after, d2_zero_after); uint64_t addr = getBufferDescriptorAddressRegisterAddress( targetModel, op.getId(), col, 0); @@ -593,6 +619,7 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { words[0] |= op.getBufferLength() & 0x1ffff; // DMA_BDX_1 + words[1] |= (op.getD0ZeroBefore() & 0x3F) << 26; words[1] |= (op.getNextBd() & 0x3f) << 20; words[1] |= (op.getUseNextBd() & 0x1) << 19; words[1] |= op.getBufferOffset() & 0x7ffff; @@ -603,15 +630,20 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { // DMA_BDX_3 // TODO: Secure Access + words[3] |= (op.getD1ZeroBefore() & 0x1F) << 27; words[3] |= (op.getD1Size() & 0x3ff) << 17; words[3] |= op.getD1Stride() & 0x1ffff; // DMA_BDX_4 // TODO: D2Size + words[4] |= (op.getD2ZeroBefore() & 0xF) << 27; words[4] |= op.getD2Stride() & 0x1ffff; // DMA_BDX_5 // ToDO: D3Stride + words[5] |= (op.getD2ZeroAfter() & 0xF) << 28; + words[5] |= (op.getD1ZeroAfter() & 0x1F) << 23; + words[5] |= (op.getD0ZeroAfter() & 0x3F) << 17; // DMA_BDX_6 words[6] |= (op.getIterationCurrent() & 0x3f) << 23; From 496d3a21bc1c188c9cc8bb4c1cdb525fb083607f Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 9 Oct 2024 15:38:12 -0600 Subject: [PATCH 09/37] Padding through dma-tasks-to-npu --- .../AIEX/Transforms/AIEDMATasksToNPU.cpp | 49 ++++++++++++++++--- .../dma-tasks-to-npu/bad-12.mlir | 26 ++++++++++ .../dma-tasks-to-npu/bad-13.mlir | 27 ++++++++++ .../dma-tasks-to-npu/bad-14.mlir | 27 ++++++++++ .../dma-tasks-to-npu/bad-15.mlir | 28 +++++++++++ .../dma-tasks-to-npu/bad-16.mlir | 27 ++++++++++ .../dma-tasks-to-npu/good-7.mlir | 32 ++++++++++++ .../dma-tasks-to-npu/good-8.mlir | 28 +++++++++++ 8 files changed, 237 insertions(+), 7 deletions(-) create mode 100644 test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir create mode 100644 test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir create mode 100644 test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir create mode 100644 test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir create mode 100644 test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir create mode 100644 test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir create mode 100644 test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 5e3d6dc3b1..124855ed7d 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -216,7 +216,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { } LogicalResult rewriteSingleBD(OpBuilder &builder, Block &block, - AIE::TileOp &tile) { + AIE::TileOp &tile, AIE::DMAChannelDir channelDir) { AIE::DMABDOp bd_op = getBdForBlock(block); const auto &target_model = AIE::getTargetModel(bd_op); MemRefType buffer_type = bd_op.getBuffer().getType(); @@ -237,12 +237,19 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { << len << " bytes falls below minimum hardware transfer unit of " << (addr_granularity / 8) << " bytes."; } - // Process strides/wraps std::optional> dims = bd_op.getDimensions(); llvm::SmallVector sizes = llvm::SmallVector(4, 0); llvm::SmallVector strides = llvm::SmallVector(4, 0); + // Padding + std::optional> padDims = + bd_op.getPadDimensions(); + llvm::SmallVector padBefore = llvm::SmallVector(4, 0); + llvm::SmallVector padAfter = llvm::SmallVector(4, 0); + std::fill(padBefore.begin(), padBefore.end(), 0); + std::fill(padAfter.begin(), padAfter.end(), 0); + if (dims && dims->size() > 0) { llvm::SmallVector input_sizes = llvm::SmallVector(4, 1); @@ -260,6 +267,22 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { input_sizes[i] = (*dims)[j].getSize(); input_strides[i] = (*dims)[j].getStride(); } + + if(target_model.isMemTile(tile.getCol(), tile.getRow()) && + channelDir == AIE::DMAChannelDir::MM2S){ + if(padDims && (padDims->size() > dims->size())) + return bd_op->emitOpError() << "Mismatch number of dimensions between padding(s)" + << " and wrap(s) and stride(s)."; + else if (padDims) + for (size_t i = 0; i < padDims->size(); i++) { + int j = padDims->size() - i - 1; + padBefore[i] = (*padDims)[j].getConstPadBefore(); + padAfter[i] = (*padDims)[j].getConstPadAfter(); + } + } + else{ + return bd_op->emitOpError() << "supports padding only for MM2S direction on MemTiles."; + } getHardwareStridesWraps(target_model, buffer_type, input_sizes, input_strides, sizes, strides); if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(), @@ -291,7 +314,16 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { return failure(); } } - + else{ + if(padDims && target_model.isMemTile(tile.getCol(), tile.getRow()) && + channelDir == AIE::DMAChannelDir::MM2S){ + return bd_op->emitOpError() << "Padding requires n-d data layouts expressed as" + << "wrap(s) and stride(s)."; + } + else if (padDims){ + return bd_op->emitOpError() << "Padding is supported only on MemTiles."; + } + } // find next BD ID, if any uint32_t use_next_bd = 0; uint32_t next_bd_id = 0; @@ -316,9 +348,10 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { /*valid_bd=*/1, /* TODO: Locks */ /*lock_rel_val=*/0, /*lock_rel_id=*/0, /*lock_acq_enable=*/0, - /*lock_acq_val=*/0, /*lock_ackq_id=*/0, /*d0_zero_before=*/0, - /*d1_zero_before=*/0, /*d2_zero_before=*/0, /*d0_zero_after=*/0, - /*d1_zero_after=*/0, /*d2_zero_after=*/0); + /*lock_acq_val=*/0, /*lock_ackq_id=*/0, /*d0_zero_before=*/padBefore[0], + /*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2], + /*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1], + /*d2_zero_after=*/padAfter[2]); return setAddressForSingleBD(builder, bd_op, tile); } @@ -393,6 +426,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { if (failed(hoistNextBdOpsIntoAttrs(op))) { return failure(); } + + auto channelDir = op.getDirection(); // Lower all BDs for (auto it = body.begin(); it != body.end(); ++it) { @@ -400,7 +435,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { if (shouldSkipBlock(block)) { continue; } - if (failed(rewriteSingleBD(builder, block, tile))) { + if (failed(rewriteSingleBD(builder, block, tile, channelDir))) { return failure(); } } diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir new file mode 100644 index 0000000000..793114ef49 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir @@ -0,0 +1,26 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} + + %t1 = aiex.dma_configure_task(%tile_0_1, S2MM, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir new file mode 100644 index 0000000000..734ed663fd --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir @@ -0,0 +1,27 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} + + %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir new file mode 100644 index 0000000000..68694921b9 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir @@ -0,0 +1,27 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // expected-error@+1 {{Padding requires n-d data layouts expressed as wrap(s) and stride(s).}} + + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir new file mode 100644 index 0000000000..8ab96d21ef --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir @@ -0,0 +1,28 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // expected-error@+1 {{Mismatch number of dimensions between padding(s) and wrap(s) and stride(s).}} + + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], [, ]) + {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir new file mode 100644 index 0000000000..752d30caa8 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir @@ -0,0 +1,27 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // expected-error@+1 {{Padding is supported only on MemTiles.}} + + %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir new file mode 100644 index 0000000000..0bde6c923a --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir @@ -0,0 +1,32 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +// This test ensures that a buffer descriptor configuration that references a buffer +// on a mem tile gets lowered to the correct NPU instruction sequence register write +// setting that BD's address. + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 2 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir new file mode 100644 index 0000000000..6baa6be36a --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir @@ -0,0 +1,28 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_stride = 0 : i32, d2_zero_after = 1 : i32, d2_zero_before = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], [, , ]) {bd_id = 0 : i32} + aie.end + } + } + } +} + From 468c1950a6bb0979f14879b45587c31ff59ce421 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 9 Oct 2024 17:31:50 -0600 Subject: [PATCH 10/37] ObjectFifo example --- .../memtile_padding_test.mlir | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 test/objectFifo-stateful-transform/memtile_padding_test.mlir diff --git a/test/objectFifo-stateful-transform/memtile_padding_test.mlir b/test/objectFifo-stateful-transform/memtile_padding_test.mlir new file mode 100644 index 0000000000..9a9651e664 --- /dev/null +++ b/test/objectFifo-stateful-transform/memtile_padding_test.mlir @@ -0,0 +1,52 @@ +//===- memtile_padding_test.mlir --------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s + +module { + aie.device(npu1_1col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + aie.objectfifo @objFifo_in0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_in1(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ([] []) + aie.objectfifo @objFifo_out1(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_out0(%tile_0_1 dimensionsToStream [, ], {%tile_0_0}, 2 : i32) {padDimensions = #aie, ]>} : !aie.objectfifo> + aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ([] []) + %core_0_2 = aie.core(%tile_0_2) { + %subview = aie.objectfifo.acquire @objFifo_in1 (Consume, 1) : !aie.objectfifosubview> + %subview1 = aie.objectfifo.acquire @objFifo_out1 (Produce, 1) : !aie.objectfifosubview> + %elem = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<64x64xi8> + %elem1 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<64x64xi8> + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c12_i8 = arith.constant 12 : i8 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %elem[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %elem1[%arg1, %arg2] : memref<64x64xi8> + } + } + aie.objectfifo.release @objFifo_in1 (Consume, 1) + aie.objectfifo.release @objFifo_out1 (Produce, 1) + aie.end + } + + aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { + aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + } +} \ No newline at end of file From 28e5ea095548328ba5353a3ff5ade7bcc8f09ace Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 16 Oct 2024 09:36:17 -0600 Subject: [PATCH 11/37] Example code and DmaTasksToNpu --- include/aie/Dialect/AIEX/IR/AIEX.td | 8 +-- .../AIEObjectFifoStatefulTransform.cpp | 43 +++++++------ lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 11 ++++ test/python/zero_pad.py | 62 +++++++++++++++++++ 4 files changed, 98 insertions(+), 26 deletions(-) create mode 100644 test/python/zero_pad.py diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index 489bef7695..6e7d9968df 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -877,13 +877,7 @@ def AIE_DMAConfigureTaskOp : AIEX_Op<"dma_configure_task", [HasParent<"RuntimeSe DMAChannelDir:$direction, I32Attr:$channel, DefaultValuedOptionalAttr:$issue_token, - DefaultValuedOptionalAttr:$repeat_count, - DefaultValuedOptionalAttr:$d0_zero_before, - DefaultValuedOptionalAttr:$d1_zero_before, - DefaultValuedOptionalAttr:$d2_zero_before, - DefaultValuedOptionalAttr:$d0_zero_after, - DefaultValuedOptionalAttr:$d1_zero_after, - DefaultValuedOptionalAttr:$d2_zero_after + DefaultValuedOptionalAttr:$repeat_count ); let regions = ( diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 8bd0221744..555327a65d 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -459,16 +459,15 @@ struct AIEObjectFifoStatefulTransformPass BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { builder.create(builder.getUnknownLoc(), acqLock, acqLockAction, acqMode); - - if (!dims.getValue().empty() && !padDimensions.getValue().empty()){ - builder.create(builder.getUnknownLoc(), buff, offset, len, dims, padDimensions); - } - else if (!dims.getValue().empty()){ + + if (!dims.getValue().empty() && !padDimensions.getValue().empty()) { + builder.create(builder.getUnknownLoc(), buff, offset, + len, dims, padDimensions); + } else if (!dims.getValue().empty()) { builder.create(builder.getUnknownLoc(), buff, offset, len, dims); - } - else{ + } else { builder.create(builder.getUnknownLoc(), buff, offset, len); - } + } builder.create(builder.getUnknownLoc(), relLock, LockAction::Release, relMode); @@ -482,7 +481,8 @@ struct AIEObjectFifoStatefulTransformPass void createBdBlock(OpBuilder &builder, ObjectFifoCreateOp op, int lockMode, int acqNum, int relNum, MyOp buff, int offset, int len, DMAChannelDir channelDir, size_t blockIndex, Block *succ, - BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { + BDDimLayoutArrayAttr dims, + BDPadLayoutArrayAttr padDimensions) { LockOp acqLock; LockOp relLock; int acqMode = 1; @@ -516,14 +516,15 @@ struct AIEObjectFifoStatefulTransformPass if (op.getProducerTileOp().isShimTile()) { createShimDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); - } else if (op.getProducerTileOp().isMemTile() && channelDir == DMAChannelDir::MM2S) { - createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, + } else if (op.getProducerTileOp().isMemTile() && + channelDir == DMAChannelDir::MM2S) { + createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, pad_dims); - } else if (op.getProducerTileOp().isMemTile() && channelDir == DMAChannelDir::S2MM){ + } else if (op.getProducerTileOp().isMemTile() && + channelDir == DMAChannelDir::S2MM) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, nullptr); - } - else { + } else { createAIETileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); } @@ -693,7 +694,8 @@ struct AIEObjectFifoStatefulTransformPass void createMemTileDMA(DeviceOp &device, OpBuilder &builder, ObjectFifoCreateOp op, DMAChannelDir channelDir, int channelIndex, int lockMode, - BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { + BDDimLayoutArrayAttr dims, + BDPadLayoutArrayAttr padDimensions) { size_t numBlocks = op.size(); if (numBlocks == 0) return; @@ -842,9 +844,10 @@ struct AIEObjectFifoStatefulTransformPass int offset = 0; if (isDistribute || isJoin) offset = extraOffset; - createBdBlock(builder, target, lockMode, acqNum, relNum, + createBdBlock(builder, target, lockMode, acqNum, relNum, buffersPerFifo[target][blockIndex], offset, - lenOut, channelDir, blockIndex, succ, dims, padDimensions); + lenOut, channelDir, blockIndex, succ, dims, + padDimensions); curr = succ; blockIndex++; } @@ -1278,7 +1281,8 @@ struct AIEObjectFifoStatefulTransformPass DMAChannel producerChan = dmaAnalysis.getMasterDMAChannel(producer.getProducerTile()); createDMA(device, builder, producer, producerChan.direction, - producerChan.channel, 0, producer.getDimensionsToStreamAttr(), producer.getPadDimensionsAttr()); + producerChan.channel, 0, producer.getDimensionsToStreamAttr(), + producer.getPadDimensionsAttr()); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); @@ -1296,7 +1300,8 @@ struct AIEObjectFifoStatefulTransformPass BDDimLayoutArrayAttr consumerDims = consumer.getDimensionsFromStreamPerConsumer()[0]; createDMA(device, builder, consumer, consumerChan.direction, - consumerChan.channel, 1, consumerDims, consumer.getPadDimensionsAttr()); + consumerChan.channel, 1, consumerDims, + consumer.getPadDimensionsAttr()); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 5978f77b47..90d14c5511 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -363,6 +363,9 @@ struct DmaToNpuPattern : OpConversionPattern { // column column = IntegerAttr::get(i32ty, col); + // row + row = IntegerAttr::get(i32ty, 0); + // arg_idx AIEX::RuntimeSequenceOp seq_op = op->getParentOfType(); @@ -479,6 +482,10 @@ struct DmaToNpuPattern : OpConversionPattern { if (!isMM2S) issue_token = BoolAttr::get(ctx, true); + // TODO: Need to add a check to only allow zero padding on MM2S channel of MemTile + // As of now, run time MemTile DMA configuration is supported only from BD level, not at + // NpuDmaMemcpyNdOp. + rewriter.create( op->getLoc(), column, bd_id, buffer_length, buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, d0_size, @@ -608,6 +615,10 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { words[7] |= (op.getLockAcqEnable() & 0x1) << 12; words[7] |= (op.getLockAcqVal() & 0xef) << 5; words[7] |= op.getLockAcqId() & 0xf; + + if(op.getD0ZeroBefore() || op.getD1ZeroBefore() || op.getD2ZeroBefore() || op.getD0ZeroAfter() || op.getD1ZeroAfter() || op.getD2ZeroAfter()){ + op->emitError("Zero padding is only available on MemTile"); + } } else if (tm.isMemTile(op.getColumn(), op.getRow())) { bd_addr = (op.getColumn() << tm.getColumnShift()) | (op.getRow() << tm.getRowShift()) | (0xA0000 + bd_id * 0x20); diff --git a/test/python/zero_pad.py b/test/python/zero_pad.py new file mode 100644 index 0000000000..79fb569681 --- /dev/null +++ b/test/python/zero_pad.py @@ -0,0 +1,62 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# RUN: %python %s | FileCheck %s +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ + +N = 56 +dev = AIEDevice.npu1_1col +col = 0 + +if len(sys.argv) > 1: + N = int(sys.argv[1]) + +if len(sys.argv) > 2: + if sys.argv[2] == "npu": + dev = AIEDevice.npu1_1col + elif sys.argv[2] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[2])) + +if len(sys.argv) > 3: + col = int(sys.argv[3]) + + +def my_passthrough(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(25, T.i32()) + memRef_ty2 = T.memref(56, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + MemTile = tile(col, 1) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_ty) + of_out = object_fifo("out", MemTile, ShimTile, 1, memRef_ty2, dimensionsToStream=[(5, 5), (5, 5)], padDimensions=[(2, 0), (3, 0)]) + object_fifo_link(of_in, of_out) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd( + metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True + ) + npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N]) + dma_wait(of_in, of_out) + + print(ctx.module) + + +my_passthrough() \ No newline at end of file From fced7a93fce7105e9e2eb83d299c6b5b73fd6b70 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 16 Oct 2024 09:45:59 -0600 Subject: [PATCH 12/37] Removed comment --- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir index 0bde6c923a..a6b9fa6bbb 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir @@ -7,10 +7,6 @@ // RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s -// This test ensures that a buffer descriptor configuration that references a buffer -// on a mem tile gets lowered to the correct NPU instruction sequence register write -// setting that BD's address. - module { aie.device(npu1_4col) { %tile_0_0 = aie.tile(0, 0) From 613617f3cb89709a28d6e12214adab60eca9019b Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 23 Oct 2024 11:31:20 -0600 Subject: [PATCH 13/37] clang format --- lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index d8cf388e92..535912a9ff 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -465,8 +465,8 @@ struct AIEObjectFifoStatefulTransformPass acqMode); if (!dims.getValue().empty() && !padDimensions.getValue().empty()) { - builder.create(builder.getUnknownLoc(), buff, offset, - len, dims, padDimensions); + builder.create(builder.getUnknownLoc(), buff, offset, len, dims, + padDimensions); } else if (!dims.getValue().empty()) { builder.create(builder.getUnknownLoc(), buff, offset, len, dims); } else { From e05429bc9f7e1312e14b799dfed41a923f751828 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 23 Oct 2024 11:34:18 -0600 Subject: [PATCH 14/37] Python format --- test/python/zero_pad.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test/python/zero_pad.py b/test/python/zero_pad.py index 79fb569681..807a4995ad 100644 --- a/test/python/zero_pad.py +++ b/test/python/zero_pad.py @@ -42,7 +42,15 @@ def device_body(): # AIE-array data movement with object fifos of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_ty) - of_out = object_fifo("out", MemTile, ShimTile, 1, memRef_ty2, dimensionsToStream=[(5, 5), (5, 5)], padDimensions=[(2, 0), (3, 0)]) + of_out = object_fifo( + "out", + MemTile, + ShimTile, + 1, + memRef_ty2, + dimensionsToStream=[(5, 5), (5, 5)], + padDimensions=[(2, 0), (3, 0)], + ) object_fifo_link(of_in, of_out) # To/from AIE-array data movement @@ -59,4 +67,4 @@ def sequence(A, B, C): print(ctx.module) -my_passthrough() \ No newline at end of file +my_passthrough() From 6e402393f06f8f10f3fb242bbdee4bc6ac319d98 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 23 Oct 2024 11:40:25 -0600 Subject: [PATCH 15/37] Remove unnecessary changes --- programming_examples/basic/passthrough_dmas/test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/programming_examples/basic/passthrough_dmas/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp index bdb360da81..3e227310cf 100644 --- a/programming_examples/basic/passthrough_dmas/test.cpp +++ b/programming_examples/basic/passthrough_dmas/test.cpp @@ -179,7 +179,6 @@ int main(int argc, const char *argv[]) { uint32_t ref = (i + 1); if (*(bufOut + i) != ref) { errors++; - std::cout<<*(bufOut+i)<<", "< Date: Wed, 23 Oct 2024 11:42:17 -0600 Subject: [PATCH 16/37] Clang format --- .../AIEX/Transforms/AIEDMATasksToNPU.cpp | 49 ++++++++++--------- lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 12 +++-- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 124855ed7d..117ff51518 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -216,7 +216,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { } LogicalResult rewriteSingleBD(OpBuilder &builder, Block &block, - AIE::TileOp &tile, AIE::DMAChannelDir channelDir) { + AIE::TileOp &tile, + AIE::DMAChannelDir channelDir) { AIE::DMABDOp bd_op = getBdForBlock(block); const auto &target_model = AIE::getTargetModel(bd_op); MemRefType buffer_type = bd_op.getBuffer().getType(); @@ -242,11 +243,13 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { bd_op.getDimensions(); llvm::SmallVector sizes = llvm::SmallVector(4, 0); llvm::SmallVector strides = llvm::SmallVector(4, 0); - // Padding + // Padding std::optional> padDims = bd_op.getPadDimensions(); - llvm::SmallVector padBefore = llvm::SmallVector(4, 0); - llvm::SmallVector padAfter = llvm::SmallVector(4, 0); + llvm::SmallVector padBefore = + llvm::SmallVector(4, 0); + llvm::SmallVector padAfter = + llvm::SmallVector(4, 0); std::fill(padBefore.begin(), padBefore.end(), 0); std::fill(padAfter.begin(), padAfter.end(), 0); @@ -268,20 +271,21 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { input_strides[i] = (*dims)[j].getStride(); } - if(target_model.isMemTile(tile.getCol(), tile.getRow()) && - channelDir == AIE::DMAChannelDir::MM2S){ - if(padDims && (padDims->size() > dims->size())) - return bd_op->emitOpError() << "Mismatch number of dimensions between padding(s)" - << " and wrap(s) and stride(s)."; + if (target_model.isMemTile(tile.getCol(), tile.getRow()) && + channelDir == AIE::DMAChannelDir::MM2S) { + if (padDims && (padDims->size() > dims->size())) + return bd_op->emitOpError() + << "Mismatch number of dimensions between padding(s)" + << " and wrap(s) and stride(s)."; else if (padDims) for (size_t i = 0; i < padDims->size(); i++) { int j = padDims->size() - i - 1; padBefore[i] = (*padDims)[j].getConstPadBefore(); padAfter[i] = (*padDims)[j].getConstPadAfter(); } - } - else{ - return bd_op->emitOpError() << "supports padding only for MM2S direction on MemTiles."; + } else { + return bd_op->emitOpError() + << "supports padding only for MM2S direction on MemTiles."; } getHardwareStridesWraps(target_model, buffer_type, input_sizes, input_strides, sizes, strides); @@ -313,14 +317,13 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { "transfer length, as this is the BD repeat count."; return failure(); } - } - else{ - if(padDims && target_model.isMemTile(tile.getCol(), tile.getRow()) && - channelDir == AIE::DMAChannelDir::MM2S){ - return bd_op->emitOpError() << "Padding requires n-d data layouts expressed as" - << "wrap(s) and stride(s)."; - } - else if (padDims){ + } else { + if (padDims && target_model.isMemTile(tile.getCol(), tile.getRow()) && + channelDir == AIE::DMAChannelDir::MM2S) { + return bd_op->emitOpError() + << "Padding requires n-d data layouts expressed as" + << "wrap(s) and stride(s)."; + } else if (padDims) { return bd_op->emitOpError() << "Padding is supported only on MemTiles."; } } @@ -349,8 +352,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { /* TODO: Locks */ /*lock_rel_val=*/0, /*lock_rel_id=*/0, /*lock_acq_enable=*/0, /*lock_acq_val=*/0, /*lock_ackq_id=*/0, /*d0_zero_before=*/padBefore[0], - /*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2], - /*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1], + /*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2], + /*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1], /*d2_zero_after=*/padAfter[2]); return setAddressForSingleBD(builder, bd_op, tile); @@ -426,7 +429,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { if (failed(hoistNextBdOpsIntoAttrs(op))) { return failure(); } - + auto channelDir = op.getDirection(); // Lower all BDs diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 90d14c5511..f72d1f8fbf 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -482,9 +482,9 @@ struct DmaToNpuPattern : OpConversionPattern { if (!isMM2S) issue_token = BoolAttr::get(ctx, true); - // TODO: Need to add a check to only allow zero padding on MM2S channel of MemTile - // As of now, run time MemTile DMA configuration is supported only from BD level, not at - // NpuDmaMemcpyNdOp. + // TODO: Need to add a check to only allow zero padding on MM2S channel of + // MemTile As of now, run time MemTile DMA configuration is supported only + // from BD level, not at NpuDmaMemcpyNdOp. rewriter.create( op->getLoc(), column, bd_id, buffer_length, buffer_offset, @@ -492,7 +492,7 @@ struct DmaToNpuPattern : OpConversionPattern { d0_stride, d1_size, d1_stride, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id, - d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, + d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, d1_zero_after, d2_zero_after); uint64_t addr = getBufferDescriptorAddressRegisterAddress( @@ -616,7 +616,9 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { words[7] |= (op.getLockAcqVal() & 0xef) << 5; words[7] |= op.getLockAcqId() & 0xf; - if(op.getD0ZeroBefore() || op.getD1ZeroBefore() || op.getD2ZeroBefore() || op.getD0ZeroAfter() || op.getD1ZeroAfter() || op.getD2ZeroAfter()){ + if (op.getD0ZeroBefore() || op.getD1ZeroBefore() || + op.getD2ZeroBefore() || op.getD0ZeroAfter() || op.getD1ZeroAfter() || + op.getD2ZeroAfter()) { op->emitError("Zero padding is only available on MemTile"); } } else if (tm.isMemTile(op.getColumn(), op.getRow())) { From 6302047dde525a4acd683aef1f6f36d39aae82b7 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 23 Oct 2024 11:47:43 -0600 Subject: [PATCH 17/37] Missing comma --- python/dialects/aie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dialects/aie.py b/python/dialects/aie.py index 50fba1a393..f5fab1b860 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -411,7 +411,7 @@ def __init__( dimensionsFromStreamPerConsumer=dimensionsFromStreamPerConsumer, via_DMA=via_DMA, plio=plio, - padDimensions=padDimensions + padDimensions=padDimensions, disable_synchronization=disable_synchronization, ) From 30b4f9ce9649cd226bd902381a74017e6241f798 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 23 Oct 2024 11:50:08 -0600 Subject: [PATCH 18/37] Python format --- python/dialects/aie.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/dialects/aie.py b/python/dialects/aie.py index f5fab1b860..717f430f92 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -106,8 +106,12 @@ def __call__(self, *call_args): def bd_dim_layout(size, stride): return Attribute.parse(f"#aie.bd_dim_layout<{size=}, {stride=}>") + def bd_pad_layout(const_pad_before, const_pad_after): - return Attribute.parse(f"#aie.bd_pad_layout<{const_pad_before=}, {const_pad_after=}>") + return Attribute.parse( + f"#aie.bd_pad_layout<{const_pad_before=}, {const_pad_after=}>" + ) + @register_attribute_builder("BDDimLayoutArrayAttr") def bd_dim_layout_array_attr_builder(tups: List[Attribute | Tuple[int]], context=None): @@ -125,7 +129,8 @@ def bd_dim_layout_array_array_attr_builder(tup_arrs: List[List[tuple]], context= f'#aie', context=context, ) - + + @register_attribute_builder("BDPadLayoutArrayAttr") def bd_pad_layout_array_attr_builder( tups: List[Union[Attribute, Tuple[int]]], context=None From e8b3bbb756e45e342e70acddd78145c62bbd4e15 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 23 Oct 2024 11:55:49 -0600 Subject: [PATCH 19/37] Python format --- programming_examples/basic/passthrough_dmas/aie2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py index bc833d05d4..3fcc8ae92c 100644 --- a/programming_examples/basic/passthrough_dmas/aie2.py +++ b/programming_examples/basic/passthrough_dmas/aie2.py @@ -71,4 +71,4 @@ def sequence(A, B, C): print(ctx.module) -my_passthrough() \ No newline at end of file +my_passthrough() From 6b5e0d46d41af3adbf148372b0a6961610aeb3c3 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Wed, 23 Oct 2024 11:56:16 -0600 Subject: [PATCH 20/37] Python format --- test/npu-xrt/memtile_repeat/distribute_repeat/aie2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/npu-xrt/memtile_repeat/distribute_repeat/aie2.py b/test/npu-xrt/memtile_repeat/distribute_repeat/aie2.py index a20e34d374..43acc5fd01 100644 --- a/test/npu-xrt/memtile_repeat/distribute_repeat/aie2.py +++ b/test/npu-xrt/memtile_repeat/distribute_repeat/aie2.py @@ -109,4 +109,4 @@ def sequence(A, B, C): print(ctx.module) -distribute_repeat() \ No newline at end of file +distribute_repeat() From 79348c332a4dff6a774899c909c12e373b3ab2f8 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 25 Oct 2024 10:13:56 -0600 Subject: [PATCH 21/37] Extra case --- lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 117ff51518..262d193321 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -283,7 +283,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { padBefore[i] = (*padDims)[j].getConstPadBefore(); padAfter[i] = (*padDims)[j].getConstPadAfter(); } - } else { + } else if (padDims) { return bd_op->emitOpError() << "supports padding only for MM2S direction on MemTiles."; } From 7db1506785128bb4ed2664e4fbb3e2bd40a47dd8 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 25 Oct 2024 10:29:02 -0600 Subject: [PATCH 22/37] Run command --- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir index 793114ef49..a1fcf96bcd 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir @@ -5,7 +5,7 @@ // // (c) Copyright 2024 AMD Inc. -// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s module { aie.device(npu1_4col) { diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir index 734ed663fd..889ae24b8c 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir @@ -5,7 +5,7 @@ // // (c) Copyright 2024 AMD Inc. -// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s module { aie.device(npu1_4col) { diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir index 68694921b9..818ae75793 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir @@ -5,7 +5,7 @@ // // (c) Copyright 2024 AMD Inc. -// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s module { aie.device(npu1_4col) { diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir index 8ab96d21ef..b9d6316ecd 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir @@ -5,7 +5,7 @@ // // (c) Copyright 2024 AMD Inc. -// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s module { aie.device(npu1_4col) { diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir index 752d30caa8..ff47a067df 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir @@ -5,7 +5,7 @@ // // (c) Copyright 2024 AMD Inc. -// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s module { aie.device(npu1_4col) { From 7975a175a23edaa0dc1d7756056a8681661796be Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 25 Oct 2024 11:35:15 -0600 Subject: [PATCH 23/37] Example code --- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir | 5 ++--- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir | 3 +-- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir | 3 +-- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir | 3 +-- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir | 7 +++---- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir | 6 +++--- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir | 6 +++--- 9 files changed, 16 insertions(+), 21 deletions(-) diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir index a1fcf96bcd..8ff16ccaf1 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir @@ -12,10 +12,9 @@ module { %tile_0_1 = aie.tile(0, 1) %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> - aiex.runtime_sequence(%arg0: memref<32xi8>) { - // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} - + aiex.runtime_sequence(%arg0: memref<32xi8>) { %t1 = aiex.dma_configure_task(%tile_0_1, S2MM, 0) { + // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} aie.dma_bd(%buf : memref<32xi8>, 4, 16, [, , ], []) {bd_id = 0 : i32} aie.end diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir index 889ae24b8c..d0291b038f 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir @@ -14,9 +14,8 @@ module { %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} - %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} aie.dma_bd(%buf : memref<32xi8>, 4, 16, [, , ], []) {bd_id = 0 : i32} aie.end diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir index b9d6316ecd..45f95e0056 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir @@ -14,9 +14,8 @@ module { %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // expected-error@+1 {{Mismatch number of dimensions between padding(s) and wrap(s) and stride(s).}} - %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + // expected-error@+1 {{Mismatch number of dimensions between padding(s) and wrap(s) and stride(s).}} aie.dma_bd(%buf : memref<32xi8>, 4, 16, [], [, ]) {bd_id = 0 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir index ff47a067df..3e58b8a5af 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir @@ -14,9 +14,8 @@ module { %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // expected-error@+1 {{Padding is supported only on MemTiles.}} - %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + // expected-error@+1 {{Padding is supported only on MemTiles.}} aie.dma_bd(%buf : memref<32xi8>, 4, 16, [], []) {bd_id = 0 : i32} aie.end diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir index 2ad275b804..61601b91b3 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir @@ -17,13 +17,13 @@ module { %tile_2_0 = aie.tile(2, 0) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task(%tile_2_0, S2MM, 1) { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} @@ -40,5 +40,4 @@ module { aiex.dma_await_task(%t2) } } -} - +} \ No newline at end of file diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir index b57cbc81bd..286ad32f52 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir @@ -16,11 +16,11 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} - // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118820 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32} - // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118852 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 0 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir index 191f1511ee..798201879e 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir @@ -16,7 +16,7 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir index 22df05bca5..143feb9e1b 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir @@ -19,7 +19,7 @@ module { %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { aie.dma_bd(%buf : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir index 5a6519a4ee..e026d9e829 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir @@ -16,14 +16,14 @@ module { aie.shim_dma_allocation @alloc1 (S2MM, 1, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task_for @alloc0 { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} - // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task_for @alloc1 { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} aie.end From 5f791fdd35e1b1c454040a3092771b28c29f48f3 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 25 Oct 2024 11:45:19 -0600 Subject: [PATCH 24/37] Push the new change --- .../memtile_padding_test.mlir | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/test/objectFifo-stateful-transform/memtile_padding_test.mlir b/test/objectFifo-stateful-transform/memtile_padding_test.mlir index 9a9651e664..0fe3b75af7 100644 --- a/test/objectFifo-stateful-transform/memtile_padding_test.mlir +++ b/test/objectFifo-stateful-transform/memtile_padding_test.mlir @@ -9,6 +9,136 @@ //===----------------------------------------------------------------------===// // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s + // CHECK: %tile_0_0 = aie.tile(0, 0) + // CHECK: %tile_0_1 = aie.tile(0, 1) + // CHECK: %tile_0_2 = aie.tile(0, 2) + // CHECK: %objFifo_out0_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "objFifo_out0_cons_prod_lock"} + // CHECK: %objFifo_out0_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_cons_lock"} + // CHECK: %objFifo_out1_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out1_cons_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_out1_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out1_cons_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_out1_cons_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out1_cons_prod_lock"} + // CHECK: %objFifo_out1_cons_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_cons_lock"} + // CHECK: %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"} + // CHECK: %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"} + // CHECK: %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"} + // CHECK: %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"} + // CHECK: %objFifo_in1_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in1_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_in1_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in1_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_in1_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in1_prod_lock"} + // CHECK: %objFifo_in1_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_lock"} + // CHECK: %objFifo_in0_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "objFifo_in0_prod_lock"} + // CHECK: %objFifo_in0_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_lock"} + // CHECK: aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + // CHECK: aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + // CHECK: aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) + // CHECK: aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) + // CHECK: %core_0_2 = aie.core(%tile_0_2) { + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: %c0 = arith.constant 0 : index + // CHECK: %c1 = arith.constant 1 : index + // CHECK: %c64 = arith.constant 64 : index + // CHECK: %c12_i8 = arith.constant 12 : i8 + // CHECK: scf.for %arg0 = %c0 to %c64 step %c1 { + // CHECK: scf.for %arg1 = %c0 to %c64 step %c1 { + // CHECK: %0 = memref.load %objFifo_in1_cons_buff_0[%arg0, %arg1] : memref<64x64xi8> + // CHECK: %1 = arith.addi %0, %c12_i8 : i8 + // CHECK: memref.store %1, %objFifo_in1_cons_buff_0[%arg0, %arg1] : memref<64x64xi8> + // CHECK: } + // CHECK: } + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + // CHECK: aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + // CHECK: aie.end + // CHECK: } + // CHECK: aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) + // CHECK: aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { + // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, issue_token = true, metadata = @objFifo_out0} : memref<64x64xi8> + // CHECK: aiex.npu.dma_wait {symbol = @objFifo_out0} + // CHECK: } + // CHECK: %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + // CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) + // CHECK: ^bb1: + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb2 + // CHECK: ^bb2: + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb1 + // CHECK: ^bb3: + // CHECK: %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) + // CHECK: ^bb4: + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb5 + // CHECK: ^bb5: + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb4 + // CHECK: ^bb6: + // CHECK: %2 = aie.dma_start(S2MM, 1, ^bb7, ^bb9) + // CHECK: ^bb7: + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb8 + // CHECK: ^bb8: + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb7 + // CHECK: ^bb9: + // CHECK: %3 = aie.dma_start(MM2S, 1, ^bb10, ^bb12) + // CHECK: ^bb10: + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_0 : memref<64x64xi8>, 0, 4096, [, ], [, ]) + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb11 + // CHECK: ^bb11: + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_1 : memref<64x64xi8>, 0, 4096, [, ], [, ]) + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb10 + // CHECK: ^bb12: + // CHECK: aie.end + // CHECK: } + // CHECK: aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + // CHECK: %mem_0_2 = aie.mem(%tile_0_2) { + // CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) + // CHECK: ^bb1: + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb2 + // CHECK: ^bb2: + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb1 + // CHECK: ^bb3: + // CHECK: %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) + // CHECK: ^bb4: + // CHECK: aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb5 + // CHECK: ^bb5: + // CHECK: aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb4 + // CHECK: ^bb6: + // CHECK: aie.end + // CHECK: } module { aie.device(npu1_1col) { From 6e13cbba18647ebe03305e056ee7c6db31c5b7c8 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 25 Oct 2024 11:54:01 -0600 Subject: [PATCH 25/37] Add zero padding to writebd in trace --- python/utils/trace.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/utils/trace.py b/python/utils/trace.py index 668455881e..07a7a3dcb8 100644 --- a/python/utils/trace.py +++ b/python/utils/trace.py @@ -527,9 +527,15 @@ def configure_shimtile_tracing_aie2( column=int(shim.col), d0_size=0, d0_stride=0, + d0_zero_after=0, + d0_zero_before=0, d1_size=0, d1_stride=0, + d1_zero_after=0, + d1_zero_before=0, d2_stride=0, + d2_zero_after=0, + d2_zero_before=0, iteration_current=0, iteration_size=0, iteration_stride=0, From 58551712561cdae4363c5fc763fde29a1a08858a Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 25 Oct 2024 15:46:42 -0600 Subject: [PATCH 26/37] Update writebd in tests --- test/Targets/NPU/npu_blockwrite_instgen.mlir | 6 ++++++ test/dialect/AIEX/bad_npu_write_bd.mlir | 8 ++++---- test/python/trace_utils.py | 2 +- test/python/zero_pad.py | 1 + 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/test/Targets/NPU/npu_blockwrite_instgen.mlir b/test/Targets/NPU/npu_blockwrite_instgen.mlir index 4ba0b41342..f38a24d5d0 100644 --- a/test/Targets/NPU/npu_blockwrite_instgen.mlir +++ b/test/Targets/NPU/npu_blockwrite_instgen.mlir @@ -46,9 +46,15 @@ module { row = 1 : i32, d0_stride = 5 : i32, d0_size = 6 : i32, + d0_zero_after = 0 : i32, + d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 8 : i32, + d1_zero_after = 0 : i32, + d1_zero_before = 0 : i32, d2_stride = 9 : i32, + d2_zero_after = 0 : i32, + d2_zero_before = 0 : i32, ddr_id = 10 : i32, iteration_current = 11 : i32, iteration_stride = 12 : i32, diff --git a/test/dialect/AIEX/bad_npu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir index 383f6ac567..5be345197e 100644 --- a/test/dialect/AIEX/bad_npu_write_bd.mlir +++ b/test/dialect/AIEX/bad_npu_write_bd.mlir @@ -15,7 +15,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -26,7 +26,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -37,7 +37,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} - aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -48,7 +48,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } \ No newline at end of file diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index c3a02a201a..8039dda9ea 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -13,7 +13,7 @@ # CHECK: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} # CHECK: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} # CHECK: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} -# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} # CHECK: aiex.npu.address_patch {addr = 118884 : ui32, arg_idx = 2 : i32, arg_plus = 1024 : i32} # CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32} diff --git a/test/python/zero_pad.py b/test/python/zero_pad.py index 807a4995ad..8257a18271 100644 --- a/test/python/zero_pad.py +++ b/test/python/zero_pad.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # RUN: %python %s | FileCheck %s +# CHECK: aie.objectfifo @out(%tile_0_1 dimensionsToStream [, ], {%tile_0_0}, 1 : i32) {padDimensions = #aie, ]>} : !aie.objectfifo> import sys from aie.dialects.aie import * From f6ce5c7aaf94cee1cd30ab5ac0836eb4bc389fb4 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 25 Oct 2024 15:48:18 -0600 Subject: [PATCH 27/37] Test case --- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir index 818ae75793..4f95a7581e 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir @@ -14,9 +14,8 @@ module { %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // expected-error@+1 {{Padding requires n-d data layouts expressed as wrap(s) and stride(s).}} - %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + // expected-error@+1 {{Padding requires n-d data layouts expressed as wrap(s) and stride(s).}} aie.dma_bd(%buf : memref<32xi8>, 4, 16, [], []) {bd_id = 0 : i32} aie.end From 6ac952459cca66f0514a80884cbd689b4cfa6911 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 25 Oct 2024 16:01:34 -0600 Subject: [PATCH 28/37] Message error --- lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 262d193321..ac5a2fa822 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -321,7 +321,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { if (padDims && target_model.isMemTile(tile.getCol(), tile.getRow()) && channelDir == AIE::DMAChannelDir::MM2S) { return bd_op->emitOpError() - << "Padding requires n-d data layouts expressed as" + << "Padding requires n-d data layouts expressed as " << "wrap(s) and stride(s)."; } else if (padDims) { return bd_op->emitOpError() << "Padding is supported only on MemTiles."; diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir index 4f95a7581e..466c73b929 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir @@ -15,7 +15,7 @@ module { aiex.runtime_sequence(%arg0: memref<32xi8>) { %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { - // expected-error@+1 {{Padding requires n-d data layouts expressed as wrap(s) and stride(s).}} + // expected-error@+1 {{Padding requires n-d data layouts expressed as wrap(s) and stride(s).}} aie.dma_bd(%buf : memref<32xi8>, 4, 16, [], []) {bd_id = 0 : i32} aie.end From ac5b356d97a537493a9fcbb2f4bacf38d5fa4a23 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 31 Oct 2024 10:46:29 -0600 Subject: [PATCH 29/37] Adding D2Size parameter --- include/aie/Dialect/AIEX/IR/AIEX.td | 1 + lib/Dialect/AIEX/IR/AIEXDialect.cpp | 4 ++++ lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp | 8 ++++++-- lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 15 +++++++++++---- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index 6e7d9968df..2e1b2b1d32 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -834,6 +834,7 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd", []> { I32Attr:$d0_stride, I32Attr:$d1_size, I32Attr:$d1_stride, + I32Attr:$d2_size, I32Attr:$d2_stride, I32Attr:$iteration_current, I32Attr:$iteration_size, diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 5ed2a2cd63..f5d2a1b89b 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -459,6 +459,10 @@ LogicalResult AIEX::NpuWriteBdOp::verify() { return emitOpError("Iteration Size exceeds the [0:63] range."); if (getIterationStride() > 0xFFFFF) return emitOpError("Iteration Stride exceeds the [0:1M-1] range."); + if(targetModel.isShimNOCTile(getColumn(), getRow()) && getD2Size() != 1) + return emitOpError("ShimTile only supports 2 dimensions of sizes."); + if(targetModel.isShimNOCTile(getColumn(), getRow()) && (getD0ZeroBefore() != 0 || getD0ZeroAfter() != 0 || getD1ZeroBefore() != 0 || getD1ZeroAfter() != 0 || getD2ZeroBefore() != 0 || getD2ZeroAfter() != 0)) + return emitOpError("ShimTile doesn't support zero padding"); return success(); } diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index ac5a2fa822..7fdad51d97 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -243,6 +243,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { bd_op.getDimensions(); llvm::SmallVector sizes = llvm::SmallVector(4, 0); llvm::SmallVector strides = llvm::SmallVector(4, 0); + int64_t d2size = 1; + // Padding std::optional> padDims = bd_op.getPadDimensions(); @@ -270,7 +272,9 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { input_sizes[i] = (*dims)[j].getSize(); input_strides[i] = (*dims)[j].getStride(); } - + d2size = (target_model.isMemTile(tile.getCol(), tile.getRow())) + ? (*dims)[2].getSize() + : 1; if (target_model.isMemTile(tile.getCol(), tile.getRow()) && channelDir == AIE::DMAChannelDir::MM2S) { if (padDims && (padDims->size() > dims->size())) @@ -341,7 +345,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { /* TODO: Strides/Wraps */ /*d0_size=*/sizes[0], /*d0_stride=*/strides[0], /*d1_size=*/sizes[1], /*d1_stride=*/strides[1], - /*d2_stride=*/strides[2], + /*d2_size=*/d2size, /*d2_stride=*/strides[2], /*iteration_current=*/0, /*iteration_size=*/sizes[3], /*iteration_stride=*/strides[3], /* TODO: Next BD */ diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 54717f2935..0a991f5b65 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -325,6 +325,7 @@ struct DmaToNpuPattern : OpConversionPattern { auto d0_stride = zero; auto d1_size = zero; auto d1_stride = zero; + auto d2_size = zero; auto d2_stride = zero; auto iteration_current = zero; auto iteration_size = zero; @@ -422,6 +423,12 @@ struct DmaToNpuPattern : OpConversionPattern { // d2_stride d2_stride = IntegerAttr::get(i32ty, strides[2]); + + // d2_size + if(targetModel.isMemTile(col, 0)) // Need to be any row + d2_size = IntegerAttr::get(i32ty, sizes[2]); + else + d2_size = IntegerAttr::get(i32ty, 1); } // iteration_current, iteration_size, iteration_stride, repeat_count if (inputSizes[3] > 1) { @@ -481,14 +488,14 @@ struct DmaToNpuPattern : OpConversionPattern { if (!isMM2S) issue_token = BoolAttr::get(ctx, true); - // TODO: Need to add a check to only allow zero padding on MM2S channel of - // MemTile As of now, run time MemTile DMA configuration is supported only - // from BD level, not at NpuDmaMemcpyNdOp. + + if(targetModel.isMemTile(col, 0) && (!isMM2S) && (op.getD0ZeroBefore() != 0 || op.getD0ZeroAfter() != 0 || op.getD1ZeroBefore() != 0 || op.getD1ZeroAfter() != 0 || op.getD2ZeroBefore() != 0 || op.getD2ZeroAfter() != 0)) + op->emitOpError("MemTile supports zero padding only on MM2S direction"); rewriter.create( op->getLoc(), column, bd_id, buffer_length, buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, d0_size, - d0_stride, d1_size, d1_stride, d2_stride, iteration_current, + d0_stride, d1_size, d1_stride, d2_size, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id, d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, From 7b76da3a756840702de212f72c470eef04aac52b Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 31 Oct 2024 10:50:24 -0600 Subject: [PATCH 30/37] Empty padDims on MemTile MM2S channel --- .../AIE/Transforms/AIEObjectFifoStatefulTransform.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 535912a9ff..0500a5b572 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -523,11 +523,10 @@ struct AIEObjectFifoStatefulTransformPass createShimDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); } else if (op.getProducerTileOp().isMemTile() && - channelDir == DMAChannelDir::MM2S) { + channelDir == DMAChannelDir::MM2S && !pad_dims.empty()) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, pad_dims); - } else if (op.getProducerTileOp().isMemTile() && - channelDir == DMAChannelDir::S2MM) { + } else if (op.getProducerTileOp().isMemTile()) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, nullptr); } else { From 743cbe2e6dc528b24120fc023a8d47a917295956 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 31 Oct 2024 10:53:03 -0600 Subject: [PATCH 31/37] clang-format --- lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp | 6 +++--- lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 7fdad51d97..3bfad493e5 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -244,7 +244,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { llvm::SmallVector sizes = llvm::SmallVector(4, 0); llvm::SmallVector strides = llvm::SmallVector(4, 0); int64_t d2size = 1; - + // Padding std::optional> padDims = bd_op.getPadDimensions(); @@ -273,8 +273,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { input_strides[i] = (*dims)[j].getStride(); } d2size = (target_model.isMemTile(tile.getCol(), tile.getRow())) - ? (*dims)[2].getSize() - : 1; + ? (*dims)[2].getSize() + : 1; if (target_model.isMemTile(tile.getCol(), tile.getRow()) && channelDir == AIE::DMAChannelDir::MM2S) { if (padDims && (padDims->size() > dims->size())) diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 0a991f5b65..d827915dc0 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -425,7 +425,7 @@ struct DmaToNpuPattern : OpConversionPattern { d2_stride = IntegerAttr::get(i32ty, strides[2]); // d2_size - if(targetModel.isMemTile(col, 0)) // Need to be any row + if (targetModel.isMemTile(col, 0)) // Need to be any row d2_size = IntegerAttr::get(i32ty, sizes[2]); else d2_size = IntegerAttr::get(i32ty, 1); @@ -488,8 +488,10 @@ struct DmaToNpuPattern : OpConversionPattern { if (!isMM2S) issue_token = BoolAttr::get(ctx, true); - - if(targetModel.isMemTile(col, 0) && (!isMM2S) && (op.getD0ZeroBefore() != 0 || op.getD0ZeroAfter() != 0 || op.getD1ZeroBefore() != 0 || op.getD1ZeroAfter() != 0 || op.getD2ZeroBefore() != 0 || op.getD2ZeroAfter() != 0)) + if (targetModel.isMemTile(col, 0) && (!isMM2S) && + (op.getD0ZeroBefore() != 0 || op.getD0ZeroAfter() != 0 || + op.getD1ZeroBefore() != 0 || op.getD1ZeroAfter() != 0 || + op.getD2ZeroBefore() != 0 || op.getD2ZeroAfter() != 0)) op->emitOpError("MemTile supports zero padding only on MM2S direction"); rewriter.create( From 5adf8bf476568587adad5f78046315b24d3aea18 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 31 Oct 2024 10:59:49 -0600 Subject: [PATCH 32/37] clang-format --- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index f5d2a1b89b..57b10e562b 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -459,9 +459,12 @@ LogicalResult AIEX::NpuWriteBdOp::verify() { return emitOpError("Iteration Size exceeds the [0:63] range."); if (getIterationStride() > 0xFFFFF) return emitOpError("Iteration Stride exceeds the [0:1M-1] range."); - if(targetModel.isShimNOCTile(getColumn(), getRow()) && getD2Size() != 1) + if (targetModel.isShimNOCTile(getColumn(), getRow()) && getD2Size() != 1) return emitOpError("ShimTile only supports 2 dimensions of sizes."); - if(targetModel.isShimNOCTile(getColumn(), getRow()) && (getD0ZeroBefore() != 0 || getD0ZeroAfter() != 0 || getD1ZeroBefore() != 0 || getD1ZeroAfter() != 0 || getD2ZeroBefore() != 0 || getD2ZeroAfter() != 0)) + if (targetModel.isShimNOCTile(getColumn(), getRow()) && + (getD0ZeroBefore() != 0 || getD0ZeroAfter() != 0 || + getD1ZeroBefore() != 0 || getD1ZeroAfter() != 0 || + getD2ZeroBefore() != 0 || getD2ZeroAfter() != 0)) return emitOpError("ShimTile doesn't support zero padding"); return success(); } From e75688d066c7a02a4a99bc560dde6a75fe82e5d5 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 31 Oct 2024 11:14:08 -0600 Subject: [PATCH 33/37] Reveert changes --- .../AIE/Transforms/AIEObjectFifoStatefulTransform.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 0500a5b572..535912a9ff 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -523,10 +523,11 @@ struct AIEObjectFifoStatefulTransformPass createShimDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); } else if (op.getProducerTileOp().isMemTile() && - channelDir == DMAChannelDir::MM2S && !pad_dims.empty()) { + channelDir == DMAChannelDir::MM2S) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, pad_dims); - } else if (op.getProducerTileOp().isMemTile()) { + } else if (op.getProducerTileOp().isMemTile() && + channelDir == DMAChannelDir::S2MM) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, nullptr); } else { From 94229b99bc5e8cfd99d6942bfbce82dbbe122309 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 31 Oct 2024 14:33:22 -0600 Subject: [PATCH 34/37] Changes to tests with D2Size --- python/utils/trace.py | 1 + test/Targets/NPU/npu_blockwrite_instgen.mlir | 1 + test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir | 4 ++-- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir | 6 +++--- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir | 4 ++-- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir | 2 +- test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir | 2 +- test/dialect/AIEX/bad_npu_write_bd.mlir | 8 ++++---- test/python/trace_utils.py | 2 +- 11 files changed, 18 insertions(+), 16 deletions(-) diff --git a/python/utils/trace.py b/python/utils/trace.py index 07a7a3dcb8..291e808d33 100644 --- a/python/utils/trace.py +++ b/python/utils/trace.py @@ -533,6 +533,7 @@ def configure_shimtile_tracing_aie2( d1_stride=0, d1_zero_after=0, d1_zero_before=0, + d2_size=1, d2_stride=0, d2_zero_after=0, d2_zero_before=0, diff --git a/test/Targets/NPU/npu_blockwrite_instgen.mlir b/test/Targets/NPU/npu_blockwrite_instgen.mlir index f38a24d5d0..9ca60fc63d 100644 --- a/test/Targets/NPU/npu_blockwrite_instgen.mlir +++ b/test/Targets/NPU/npu_blockwrite_instgen.mlir @@ -52,6 +52,7 @@ module { d1_size = 8 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, + d2_size = 1 : i32, d2_stride = 9 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir index 61601b91b3..0638a4ddd7 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir @@ -17,13 +17,13 @@ module { %tile_2_0 = aie.tile(2, 0) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task(%tile_2_0, S2MM, 1) { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir index 286ad32f52..8412749514 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir @@ -16,11 +16,11 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} - // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118820 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32} - // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118852 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 0 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir index 798201879e..5090e7af53 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir @@ -16,7 +16,7 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir index 143feb9e1b..4c4e6e7a1a 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir @@ -19,7 +19,7 @@ module { %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 4 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { aie.dma_bd(%buf : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir index e026d9e829..63e1304561 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir @@ -16,13 +16,13 @@ module { aie.shim_dma_allocation @alloc1 (S2MM, 1, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task_for @alloc0 { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task_for @alloc1 { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir index a6b9fa6bbb..aa72ecfe7f 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir @@ -15,7 +15,7 @@ module { %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 2 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 2 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 4 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { aie.dma_bd(%buf : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir index 6baa6be36a..e2f603fff2 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir @@ -15,7 +15,7 @@ module { %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_stride = 0 : i32, d2_zero_after = 1 : i32, d2_zero_before = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_size = 4 : i32, d2_stride = 0 : i32, d2_zero_after = 1 : i32, d2_zero_before = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { aie.dma_bd(%buf : memref<32xi8>, 4, 16, diff --git a/test/dialect/AIEX/bad_npu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir index 5be345197e..f1015cb3c5 100644 --- a/test/dialect/AIEX/bad_npu_write_bd.mlir +++ b/test/dialect/AIEX/bad_npu_write_bd.mlir @@ -15,7 +15,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -26,7 +26,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -37,7 +37,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} - aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -48,7 +48,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } \ No newline at end of file diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index 8039dda9ea..6fba1c1cc3 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -13,7 +13,7 @@ # CHECK: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} # CHECK: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} # CHECK: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} -# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} # CHECK: aiex.npu.address_patch {addr = 118884 : ui32, arg_idx = 2 : i32, arg_plus = 1024 : i32} # CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32} From 71722af9a5ef5a73ea4c1605642686ceb8f18fe1 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Thu, 31 Oct 2024 15:52:23 -0600 Subject: [PATCH 35/37] D2Size --- .../AIEObjectFifoStatefulTransform.cpp | 4 +- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 4 +- .../AIEX/Transforms/AIEDMATasksToNPU.cpp | 4 +- lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 2 +- python/utils/trace.py | 2 +- .../dma-tasks-to-npu/good-1.mlir | 4 +- .../dma-tasks-to-npu/good-2.mlir | 6 +-- .../dma-tasks-to-npu/good-3.mlir | 2 +- .../dma-tasks-to-npu/good-6.mlir | 4 +- test/dialect/AIEX/bad_npu_write_bd.mlir | 50 +++++++++++++++++-- test/npu-xrt/memtile_dmas/writebd/aie.mlir | 8 +-- .../memtile_dmas/writebd_tokens/aie.mlir | 8 +-- .../aie2.py | 7 +++ test/python/trace_utils.py | 2 +- 14 files changed, 79 insertions(+), 28 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 535912a9ff..d882b54740 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -523,7 +523,7 @@ struct AIEObjectFifoStatefulTransformPass createShimDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); } else if (op.getProducerTileOp().isMemTile() && - channelDir == DMAChannelDir::MM2S) { + channelDir == DMAChannelDir::MM2S) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, pad_dims); } else if (op.getProducerTileOp().isMemTile() && @@ -1479,7 +1479,7 @@ struct AIEObjectFifoStatefulTransformPass consumer.getDimensionsFromStreamPerConsumer()[0]; createDMA(device, builder, consumer, consumerChan.direction, consumerChan.channel, 1, consumerDims, - consumer.getPadDimensionsAttr()); + nullptr); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 57b10e562b..7c6f4afa3a 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -459,13 +459,13 @@ LogicalResult AIEX::NpuWriteBdOp::verify() { return emitOpError("Iteration Size exceeds the [0:63] range."); if (getIterationStride() > 0xFFFFF) return emitOpError("Iteration Stride exceeds the [0:1M-1] range."); - if (targetModel.isShimNOCTile(getColumn(), getRow()) && getD2Size() != 1) + if (targetModel.isShimNOCTile(getColumn(), getRow()) && getD2Size() != 0) return emitOpError("ShimTile only supports 2 dimensions of sizes."); if (targetModel.isShimNOCTile(getColumn(), getRow()) && (getD0ZeroBefore() != 0 || getD0ZeroAfter() != 0 || getD1ZeroBefore() != 0 || getD1ZeroAfter() != 0 || getD2ZeroBefore() != 0 || getD2ZeroAfter() != 0)) - return emitOpError("ShimTile doesn't support zero padding"); + return emitOpError("ShimTile doesn't support zero padding."); return success(); } diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 3bfad493e5..47e084fc96 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -243,7 +243,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { bd_op.getDimensions(); llvm::SmallVector sizes = llvm::SmallVector(4, 0); llvm::SmallVector strides = llvm::SmallVector(4, 0); - int64_t d2size = 1; + int64_t d2size = 0; // Padding std::optional> padDims = @@ -274,7 +274,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { } d2size = (target_model.isMemTile(tile.getCol(), tile.getRow())) ? (*dims)[2].getSize() - : 1; + : 0; if (target_model.isMemTile(tile.getCol(), tile.getRow()) && channelDir == AIE::DMAChannelDir::MM2S) { if (padDims && (padDims->size() > dims->size())) diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index d827915dc0..f5425448d9 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -428,7 +428,7 @@ struct DmaToNpuPattern : OpConversionPattern { if (targetModel.isMemTile(col, 0)) // Need to be any row d2_size = IntegerAttr::get(i32ty, sizes[2]); else - d2_size = IntegerAttr::get(i32ty, 1); + d2_size = IntegerAttr::get(i32ty, 0); } // iteration_current, iteration_size, iteration_stride, repeat_count if (inputSizes[3] > 1) { diff --git a/python/utils/trace.py b/python/utils/trace.py index 291e808d33..8f1b4e6624 100644 --- a/python/utils/trace.py +++ b/python/utils/trace.py @@ -533,7 +533,7 @@ def configure_shimtile_tracing_aie2( d1_stride=0, d1_zero_after=0, d1_zero_before=0, - d2_size=1, + d2_size=0, d2_stride=0, d2_zero_after=0, d2_zero_before=0, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir index 0638a4ddd7..82ae4df6d1 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir @@ -17,13 +17,13 @@ module { %tile_2_0 = aie.tile(2, 0) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task(%tile_2_0, S2MM, 1) { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir index 8412749514..094284c77f 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir @@ -16,11 +16,11 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} - // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118820 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32} - // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118852 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 0 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir index 5090e7af53..36c828393c 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir @@ -16,7 +16,7 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir index 63e1304561..f1fe68dda0 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir @@ -16,13 +16,13 @@ module { aie.shim_dma_allocation @alloc1 (S2MM, 1, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task_for @alloc0 { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task_for @alloc1 { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} diff --git a/test/dialect/AIEX/bad_npu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir index f1015cb3c5..a4d4f2a5ae 100644 --- a/test/dialect/AIEX/bad_npu_write_bd.mlir +++ b/test/dialect/AIEX/bad_npu_write_bd.mlir @@ -26,7 +26,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -37,7 +37,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} - aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -48,7 +48,51 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + } + } +} + +// ----- + +module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + // expected-error@+1 {{ShimTile only supports 2 dimensions of sizes.}} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 512 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 100 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + } + } +} + +// ----- + +module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + // expected-error@+1 {{ShimTile doesn't support zero padding.}} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 512 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 2 : i32, d2_zero_before = 1 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + } + } +} + +// ----- + +module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + // expected-error@+1 {{ShimTile doesn't support zero padding.}} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_stride = 7 : i32, d1_size = 512 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + } + } +} + +// ----- + +module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + // expected-error@+1 {{ShimTile doesn't support zero padding.}} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 512 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } \ No newline at end of file diff --git a/test/npu-xrt/memtile_dmas/writebd/aie.mlir b/test/npu-xrt/memtile_dmas/writebd/aie.mlir index eb414000be..c73cdebc9c 100644 --- a/test/npu-xrt/memtile_dmas/writebd/aie.mlir +++ b/test/npu-xrt/memtile_dmas/writebd/aie.mlir @@ -20,14 +20,14 @@ module { aie.flow(%tile_0_1, DMA : 0, %tile_0_0, DMA : 0) aie.shim_dma_allocation @in(MM2S, 0, 0) aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { - aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 2 : i32, arg_plus = 0 : i32} aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483648 : ui32} - aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 64 : i32, lock_acq_val = 127 : i32, lock_rel_id = 65 : i32, lock_rel_val = 1 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 64 : i32, lock_acq_val = 127 : i32, lock_rel_id = 65 : i32, lock_rel_val = 1 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 656900 : ui32, column = 0 : i32, row = 1 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 65 : i32, lock_acq_val = 127 : i32, lock_rel_id = 64 : i32, lock_rel_val = 1 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 65 : i32, lock_acq_val = 127 : i32, lock_rel_id = 64 : i32, lock_rel_val = 1 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 656948 : ui32, column = 0 : i32, row = 1 : i32, value = 1 : ui32} - aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.address_patch {addr = 118820 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} aiex.npu.maskwrite32 {address = 119296 : ui32, column = 0 : i32, row = 0 : i32, mask = 0x00000F00 : ui32, value = 0x100 : ui32} aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32} diff --git a/test/npu-xrt/memtile_dmas/writebd_tokens/aie.mlir b/test/npu-xrt/memtile_dmas/writebd_tokens/aie.mlir index 10e2b0b707..333b4c3972 100644 --- a/test/npu-xrt/memtile_dmas/writebd_tokens/aie.mlir +++ b/test/npu-xrt/memtile_dmas/writebd_tokens/aie.mlir @@ -26,18 +26,18 @@ module { aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { // BD0, DMA_S2MM_0_Task_Queue - aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 2 : i32, arg_plus = 0 : i32} aiex.npu.maskwrite32 {address = 0x1d200 : ui32, column = 0 : i32, row = 0 : i32, mask = 0x00000F00 : ui32, value = 0x200 : ui32} aiex.npu.write32 {address = 0x1d204 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32} // BD1, DMA_MM2S_0_Task_Queue - aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.address_patch {addr = 0x1d024 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32} // BD0, DMA_S2MM_0_Start_Queue - aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.maskwrite32 {address = 0xa0600 : ui32, column = 0 : i32, row = 1 : i32, mask = 0x00000F00 : ui32, value = 0x100 : ui32} aiex.npu.write32 {address = 0xa0604 : ui32, column = 0 : i32, row = 1 : i32, value = 0x80000000 : ui32} @@ -45,7 +45,7 @@ module { aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 1 : i32, row_num = 1 : i32} // BD1, DMA_MM2S_0_Start_Queue - aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 0xa0634 : ui32, column = 0 : i32, row = 1 : i32, value = 1 : ui32} // sync with the copy out via shimdma diff --git a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py index bb0bdb4203..e42fcb87e7 100644 --- a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py +++ b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py @@ -89,9 +89,16 @@ def sequence(input, output): iteration_stride=0, d0_size=0, d0_stride=0, + d0_zero_after=0, + d0_zero_before=0, d1_size=0, d1_stride=0, + d1_zero_after=0, + d1_zero_before=0, + d2_size=0, d2_stride=0, + d2_zero_after=0, + d2_zero_before=0, enable_packet=0, out_of_order_id=0, packet_id=0, diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index 6fba1c1cc3..f9d57e2108 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -13,7 +13,7 @@ # CHECK: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} # CHECK: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} # CHECK: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} -# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} # CHECK: aiex.npu.address_patch {addr = 118884 : ui32, arg_idx = 2 : i32, arg_plus = 1024 : i32} # CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32} From 5e12917024210d305f9391659b18f5c8268e2beb Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 1 Nov 2024 15:40:31 -0600 Subject: [PATCH 36/37] Checking syntax --- .../AIEObjectFifoStatefulTransform.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index d882b54740..c46e3d369c 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -464,9 +464,13 @@ struct AIEObjectFifoStatefulTransformPass builder.create(builder.getUnknownLoc(), acqLock, acqLockAction, acqMode); - if (!dims.getValue().empty() && !padDimensions.getValue().empty()) { - builder.create(builder.getUnknownLoc(), buff, offset, len, dims, - padDimensions); + if (!dims.getValue().empty() && padDimensions) { + if (!padDimensions.getValue().empty()) + builder.create(builder.getUnknownLoc(), buff, offset, len, + dims, padDimensions); + else + builder.create(builder.getUnknownLoc(), buff, offset, len, + dims); } else if (!dims.getValue().empty()) { builder.create(builder.getUnknownLoc(), buff, offset, len, dims); } else { @@ -523,11 +527,13 @@ struct AIEObjectFifoStatefulTransformPass createShimDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); } else if (op.getProducerTileOp().isMemTile() && - channelDir == DMAChannelDir::MM2S) { + channelDir == DMAChannelDir::MM2S && + !pad_dims.getValue().empty()) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, pad_dims); } else if (op.getProducerTileOp().isMemTile() && - channelDir == DMAChannelDir::S2MM) { + (channelDir == DMAChannelDir::S2MM || + pad_dims.getValue().empty())) { createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims, nullptr); } else { @@ -1478,8 +1484,7 @@ struct AIEObjectFifoStatefulTransformPass BDDimLayoutArrayAttr consumerDims = consumer.getDimensionsFromStreamPerConsumer()[0]; createDMA(device, builder, consumer, consumerChan.direction, - consumerChan.channel, 1, consumerDims, - nullptr); + consumerChan.channel, 1, consumerDims, nullptr); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); From e5dd19402ee6d25fccf40c882d4dfbb87249d715 Mon Sep 17 00:00:00 2001 From: Pranathi Vasireddy Date: Fri, 1 Nov 2024 15:48:44 -0600 Subject: [PATCH 37/37] Revert change --- .../AIE/Transforms/AIEObjectFifoStatefulTransform.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index c46e3d369c..8ecb3ca2cc 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -464,13 +464,9 @@ struct AIEObjectFifoStatefulTransformPass builder.create(builder.getUnknownLoc(), acqLock, acqLockAction, acqMode); - if (!dims.getValue().empty() && padDimensions) { - if (!padDimensions.getValue().empty()) - builder.create(builder.getUnknownLoc(), buff, offset, len, - dims, padDimensions); - else - builder.create(builder.getUnknownLoc(), buff, offset, len, - dims); + if (!dims.getValue().empty() && !padDimensions.getValue().empty()) { + builder.create(builder.getUnknownLoc(), buff, offset, len, dims, + padDimensions); } else if (!dims.getValue().empty()) { builder.create(builder.getUnknownLoc(), buff, offset, len, dims); } else {