From 496a79dd45f75f6434754aecf263baeffcc3ebb4 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 10:14:36 -0600 Subject: [PATCH 01/11] Add torch to lit cfg for programming_examples --- .github/workflows/buildAndTestRyzenAI.yml | 1 + programming_examples/lit.cfg.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml index acf2262fa2..bc3988e002 100644 --- a/.github/workflows/buildAndTestRyzenAI.yml +++ b/.github/workflows/buildAndTestRyzenAI.yml @@ -127,6 +127,7 @@ jobs: python -m venv aie-venv source aie-venv/bin/activate pip install -r python/requirements.txt + pip install -r python/requirements_ml.txt pip install jupyter sed -i.bak 's/OUTPUT_TIMEOUT = 10/OUTPUT_TIMEOUT = 100/g' \ $(python -c 'import site; print(site.getsitepackages()[0])')/jupyter_client/runapp.py diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index b774bc5280..61acb45937 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -165,6 +165,14 @@ opencv_flags = "" config.substitutions.append(("%opencv_flags", opencv_flags)) +try: + import torch + + config.available_features.add("torch") + except: + print("torch not found") + pass + VitisSysrootFlag = "" if "x86_64" in config.aieHostTarget: config.substitutions.append(("%aieHostTargetTriplet%", "x86_64-unknown-linux-gnu")) From afa454a1c1e65fb03115897dedecb2a9b3d5628a Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 10:25:24 -0600 Subject: [PATCH 02/11] Fixes --- programming_examples/lit.cfg.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 61acb45937..5e1871c57c 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -166,12 +166,12 @@ config.substitutions.append(("%opencv_flags", opencv_flags)) try: - import torch + import torch - config.available_features.add("torch") - except: - print("torch not found") - pass + config.available_features.add("torch") +except ImportError: + print("torch not found") + pass VitisSysrootFlag = "" if "x86_64" in config.aieHostTarget: From f49dcff88378073ff16c288119c130d2ebed468c Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 10:28:24 -0600 Subject: [PATCH 03/11] Update lit.cfg.py torch not found error --- programming_examples/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 5e1871c57c..ffb130bc42 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -170,7 +170,7 @@ config.available_features.add("torch") except ImportError: - print("torch not found") + print("torch not found", file=sys.stder) pass VitisSysrootFlag = "" From 3f4fd7db243e80a199f42daa6347b1767773c3e2 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 10:39:49 -0600 Subject: [PATCH 04/11] Fix typo --- programming_examples/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index ffb130bc42..a03d2c7338 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -170,7 +170,7 @@ config.available_features.add("torch") except ImportError: - print("torch not found", file=sys.stder) + print("torch not found", file=sys.stderr) pass VitisSysrootFlag = "" From 
a4157264de25f6742190573fe6fa9f79de457bee Mon Sep 17 00:00:00 2001 From: singagan <53442471+singagan@users.noreply.github.com> Date: Mon, 22 Apr 2024 16:17:31 +0200 Subject: [PATCH 05/11] Resnet with python binding (#1368) --- .../ml/resnet/layers_conv2_x/Makefile | 9 +- .../ml/resnet/layers_conv2_x/aie2.py | 1106 ++++++++++------- 2 files changed, 693 insertions(+), 422 deletions(-) diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile index 2f978a05ba..d8f1b7261a 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/Makefile +++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile @@ -12,13 +12,10 @@ mlirFileName = aie all: build/conv2dk1_i8.o build/conv2dk1_skip_init.o build/conv2dk3.o build/conv2dk1_skip.o build/conv2dk1_ui8.o build/final.xclbin -# build/${mlirFileName}.mlir: aie2.py -# mkdir -p ${@D} -# python3 $< > $@ - -build/${mlirFileName}.mlir: aie.mlir +build/${mlirFileName}.mlir: aie2.py mkdir -p ${@D} - cp $< $@ + python3 $< > $@ + insts.txt: build/${mlirFileName}.mlir aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 385a4fc7a5..235b5c5308 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -8,7 +8,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * from aie.extras.dialects.ext import memref, arith -from aie.extras.dialects.ext.scf import range_, yield_ +from aie.dialects.scf import for_, yield_ from aie.extras.context import mlir_mod_ctx from aie.ir import MemRefType, TypeAttr @@ -21,21 +21,16 @@ # Define bottleneck layer sizes -tensorInW = 32 -tensorInH = 32 -tensorInC = 256 -tensorL1InC = tensorInC -tensorL1OutC = tensorL1InC // 4 +def resnet_conv_x(): -tensorL2InC = tensorL1OutC -tensorL2OutC = tensorL2InC + tensorInW = 32 + tensorInH = 32 + tensorInCInit = 64 + tensorInCRest = 4 * tensorInCInit + n_cols = 3 + repeat = 2 -tensorL3InC = tensorL2OutC -tensorL3OutC = tensorL3InC * 4 - - -def bottleneck4AIEs(): with mlir_mod_ctx() as ctx: @device(AIEDevice.ipu) @@ -44,23 +39,36 @@ def deviceBody(): # define types uint8_ty = IntegerType.get_unsigned(8) int8_ty = IntegerType.get_signless(8) - int16_ty = IntegerType.get_signless(16) int32_ty = IntegerType.get_signless(32) - tensorLayer1In_ty = MemRefType.get( + tensorLayer1In_ty_init = MemRefType.get( ( tensorInW, 1, - tensorL1InC, + tensorInCInit, ), int8_ty, ) - weightsLayer1_ty = MemRefType.get((tensorL1InC * tensorL1OutC,), int8_ty) + tensorLayer1In_ty_rest = MemRefType.get( + ( + tensorInW, + 1, + tensorInCRest, + ), + uint8_ty, + ) + weightsLayer1_ty_init = MemRefType.get( + (tensorInCInit * tensorInCInit,), int8_ty + ) + weightsLayer1_ty_rest = MemRefType.get( + (tensorInCRest * tensorInCInit,), int8_ty + ) + tensorLayer1Out_ty = MemRefType.get( ( tensorInW, 1, - tensorL1OutC, + tensorInCInit, ), uint8_ty, ) @@ -69,18 +77,18 @@ def deviceBody(): ( tensorInW, 1, - tensorL2InC, + tensorInCInit, ), uint8_ty, ) weightsLayer2_ty = MemRefType.get( - (3 * 3 * tensorL2InC * tensorL2OutC,), int8_ty + (3 * 3 * tensorInCInit * tensorInCInit,), int8_ty ) tensorLayer2Out_ty = MemRefType.get( ( tensorInW, 1, - tensorL2OutC // 2, + tensorInCInit // 2, ), uint8_ty, ) @@ -89,35 +97,51 @@ def deviceBody(): ( tensorInW, 1, - tensorL3InC // 2, + tensorInCInit // 2, ), uint8_ty, ) - weightsLayer3_ty = MemRefType.get((tensorL3InC * 
tensorL3OutC,), int8_ty) + weightsLayer3_ty_init = MemRefType.get( + (2 * tensorInCInit * tensorInCRest,), int8_ty + ) + weightsLayer3_ty_rest = MemRefType.get( + (tensorInCRest // 4 * tensorInCRest,), int8_ty + ) + tensorLayer3Out_ty = MemRefType.get( ( tensorInW, 1, - tensorL3OutC, + tensorInCRest, ), uint8_ty, ) - allWeights_ty = MemRefType.get( + allWeights_ty_init = MemRefType.get( + ( + tensorInCInit * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest + + tensorInCInit * tensorInCRest, + ), + int8_ty, + ) + + allWeights_ty_rest = MemRefType.get( ( - tensorL1InC * tensorL1OutC - + 3 * 3 * tensorL2InC * tensorL2OutC - + tensorL3InC * tensorL3OutC, + tensorInCRest * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest, ), int8_ty, ) # kernel definitions - conv2dk1 = external_func( + conv2dk1_i8 = external_func( "conv2dk1_i8", inputs=[ - tensorLayer1In_ty, - weightsLayer1_ty, + tensorLayer1In_ty_init, + weightsLayer1_ty_init, tensorLayer1Out_ty, int32_ty, int32_ty, @@ -143,14 +167,42 @@ def deviceBody(): int32_ty, ], ) - conv2dk1_skip = external_func( - "conv2dk1_skip_i8", + conv2dk1_skip_init_i8 = external_func( + "conv2dk1_skip_init_i8", + inputs=[ + tensorLayer3In_ty, + tensorLayer3In_ty, + weightsLayer3_ty_init, + tensorLayer3Out_ty, + tensorLayer1In_ty_init, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk1_ui8 = external_func( + "conv2dk1_ui8", + inputs=[ + tensorLayer3Out_ty, + weightsLayer1_ty_rest, + tensorLayer1Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + + conv2dk1_skip_ui8 = external_func( + "conv2dk1_skip_ui8", inputs=[ tensorLayer3In_ty, tensorLayer3In_ty, - weightsLayer3_ty, + weightsLayer3_ty_rest, + tensorLayer3Out_ty, tensorLayer3Out_ty, - tensorLayer1In_ty, int32_ty, int32_ty, int32_ty, @@ -159,461 +211,658 @@ def deviceBody(): ], ) - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) + ShimTile00 = tile(0, 0) + MemTile01 = tile(0, 1) + ComputeTile02 = tile(0, 2) + ComputeTile03 = tile(0, 3) + ComputeTile04 = tile(0, 4) + ComputeTile05 = tile(0, 5) + + ShimTile10 = tile(1, 0) + MemTile11 = tile(1, 1) + ComputeTile12 = tile(1, 2) + ComputeTile13 = tile(1, 3) + ComputeTile14 = tile(1, 4) + ComputeTile15 = tile(1, 5) + + ShimTile20 = tile(2, 0) + MemTile21 = tile(2, 1) + ComputeTile22 = tile(2, 2) + ComputeTile23 = tile(2, 3) + ComputeTile24 = tile(2, 4) + ComputeTile25 = tile(2, 5) + + shims = [ShimTile00, ShimTile10, ShimTile20] + mems = [MemTile01, MemTile11, MemTile21] + wts_sizes = [allWeights_ty_init, allWeights_ty_rest, allWeights_ty_rest] + layer1_wts_sizes = [ + weightsLayer1_ty_init, + weightsLayer1_ty_rest, + weightsLayer1_ty_rest, + ] + laye1_act_sizes = [ + tensorLayer1In_ty_init, + tensorLayer1In_ty_rest, + tensorLayer1In_ty_rest, + ] + layer3_wts_sizes = [ + weightsLayer3_ty_init, + weightsLayer3_ty_rest, + weightsLayer3_ty_rest, + ] + + cores = [ + [ComputeTile02, ComputeTile03, ComputeTile04, ComputeTile05], + [ComputeTile15, ComputeTile14, ComputeTile13, ComputeTile12], + [ComputeTile22, ComputeTile23, ComputeTile24, ComputeTile25], + ] if enableTrace: - flow(ComputeTile4, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) + flow(ComputeTile04, WireBundle.Trace, 0, ShimTile00, WireBundle.DMA, 1) # runtime parameters - rtpComputeTile2 = Buffer(ComputeTile2, [16], T.i32(), "rtpComputeTile2") - rtpComputeTile3 = 
Buffer(ComputeTile3, [16], T.i32(), "rtpComputeTile3") - rtpComputeTile4 = Buffer(ComputeTile4, [16], T.i32(), "rtpComputeTile4") - rtpComputeTile5 = Buffer(ComputeTile5, [16], T.i32(), "rtpComputeTile5") - + rtpComputeTile02 = Buffer(ComputeTile02, [16], T.i32(), "rtpComputeTile02") + rtpComputeTile03 = Buffer(ComputeTile03, [16], T.i32(), "rtpComputeTile03") + rtpComputeTile04 = Buffer(ComputeTile04, [16], T.i32(), "rtpComputeTile04") + rtpComputeTile05 = Buffer(ComputeTile05, [16], T.i32(), "rtpComputeTile05") + + rtpComputeTile12 = Buffer(ComputeTile12, [16], T.i32(), "rtpComputeTile12") + rtpComputeTile13 = Buffer(ComputeTile13, [16], T.i32(), "rtpComputeTile13") + rtpComputeTile14 = Buffer(ComputeTile14, [16], T.i32(), "rtpComputeTile14") + rtpComputeTile15 = Buffer(ComputeTile15, [16], T.i32(), "rtpComputeTile15") + + rtpComputeTile22 = Buffer(ComputeTile22, [16], T.i32(), "rtpComputeTile22") + rtpComputeTile23 = Buffer(ComputeTile23, [16], T.i32(), "rtpComputeTile23") + rtpComputeTile24 = Buffer(ComputeTile24, [16], T.i32(), "rtpComputeTile24") + rtpComputeTile25 = Buffer(ComputeTile25, [16], T.i32(), "rtpComputeTile25") + + rtp = [ + [ + rtpComputeTile02, + rtpComputeTile03, + rtpComputeTile04, + rtpComputeTile05, + ], + [ + rtpComputeTile15, + rtpComputeTile14, + rtpComputeTile13, + rtpComputeTile12, + ], + [ + rtpComputeTile22, + rtpComputeTile23, + rtpComputeTile24, + rtpComputeTile25, + ], + ] + rtp_name = [ + [ + "rtpComputeTile02", + "rtpComputeTile03", + "rtpComputeTile04", + "rtpComputeTile05", + ], + [ + "rtpComputeTile12", + "rtpComputeTile13", + "rtpComputeTile14", + "rtpComputeTile15", + ], + [ + "rtpComputeTile22", + "rtpComputeTile23", + "rtpComputeTile24", + "rtpComputeTile25", + ], + ] # set up data movement with OFs + conv1_kernels = ["conv2dk1_i8.o", "conv2dk1_ui8.o", "conv2dk1_ui8.o"] + conv1_kernels_call = [conv2dk1_i8, conv2dk1_ui8, conv2dk1_ui8] + + conv3_kernels = [ + "conv2dk1_skip_init.o", + "conv2dk1_skip.o", + "conv2dk1_skip.o", + ] + conv3_kernels_call = [ + conv2dk1_skip_init_i8, + conv2dk1_skip_ui8, + conv2dk1_skip_ui8, + ] + + act1_fifo_names = ["act1_00_02_01", "act1_04_15_01", "act1_13_22_21"] + act1_fifos = {} + + wts_fifo_names = ["wts_0_L3L2", "wts_1_L3L2", "wts_2_L3L2"] + wts_fifos = {} + wts_sub_fifo_names = [ + ["wts_buf_00", "wts_buf_01", "wts_buf_02"], + ["wts_buf_10", "wts_buf_11", "wts_buf_12"], + ["wts_buf_20", "wts_buf_21", "wts_buf_22"], + ] + wts_sub_fifos = {} + + for i in range(n_cols): + wts_fifos[wts_fifo_names[i]] = object_fifo( + wts_fifo_names[i], shims[i], mems[i], 1, wts_sizes[i] + ) + wts_sub_fifos[wts_sub_fifo_names[i][0]] = object_fifo( + wts_sub_fifo_names[i][0], + mems[i], + cores[i][0], + 1, + layer1_wts_sizes[i], + ) + wts_sub_fifos[wts_sub_fifo_names[i][1]] = object_fifo( + wts_sub_fifo_names[i][1], + mems[i], + [cores[i][1], cores[i][3]], + 1, + weightsLayer2_ty, + ) + wts_sub_fifos[wts_sub_fifo_names[i][2]] = object_fifo( + wts_sub_fifo_names[i][2], + mems[i], + cores[i][2], + 1, + layer3_wts_sizes[i], + ) + object_fifo_link( + wts_fifo_names[i], + [ + wts_sub_fifo_names[i][0], + wts_sub_fifo_names[i][1], + wts_sub_fifo_names[i][2], + ], + ) + # input tensor (with broadcast for skip connection) - of_inOF_act_L3L2 = object_fifo( - "inOF_act_L3L2", - ShimTile, - [ComputeTile2, MemTile], + act1_fifo_names = ["act1_00_02_01", "act1_04_15_11", "act1_13_22_21"] + act1_fifos = {} + + skip_fifo_names = ["skip_0", "skip_1", "skip_2"] + skip_fifos = {} + + act1_fifos[act1_fifo_names[0]] = object_fifo( + act1_fifo_names[0], + 
shims[0], + [cores[0][0], mems[0]], [2, 2, 4], - tensorLayer1In_ty, + laye1_act_sizes[0], ) - of_skip_buf = object_fifo( - "skip_buf", MemTile, ComputeTile4, 2, tensorLayer1In_ty + skip_fifos[skip_fifo_names[0]] = object_fifo( + skip_fifo_names[0], mems[0], cores[0][2], 2, laye1_act_sizes[0] ) - object_fifo_link(of_inOF_act_L3L2, of_skip_buf) + object_fifo_link(act1_fifo_names[0], skip_fifo_names[0]) + + for i in range(1, repeat + 1): + act1_fifos[act1_fifo_names[i]] = object_fifo( + act1_fifo_names[i], + cores[i - 1][2], + [cores[i][0], mems[i - 1]], + [2, 2, 4], + laye1_act_sizes[i], + ) + skip_fifos[skip_fifo_names[i]] = object_fifo( + skip_fifo_names[i], + mems[i - 1], + cores[i][2], + 2, + laye1_act_sizes[i], + ) + object_fifo_link(act1_fifo_names[i], skip_fifo_names[i]) - # weights - inOF_wts_0_L3L2 = object_fifo( - "inOF_wts_0_L3L2", ShimTile, MemTile, 1, allWeights_ty - ) - of_wts_buf_00 = object_fifo( - "wts_buf_00", MemTile, ComputeTile2, 1, weightsLayer1_ty - ) - wts_buf_01 = object_fifo( - "wts_buf_01", - MemTile, - [ComputeTile3, ComputeTile5], - 1, - weightsLayer2_ty, - ) - wts_buf_02 = object_fifo( - "wts_buf_02", MemTile, ComputeTile4, 1, weightsLayer3_ty - ) - object_fifo_link(inOF_wts_0_L3L2, [of_wts_buf_00, wts_buf_01, wts_buf_02]) - - # activation tensor - of_act_2_3_5 = object_fifo( - "act_2_3_5", - ComputeTile2, - [ComputeTile3, ComputeTile5], - [2, 4, 4], - tensorLayer1Out_ty, - ) # 1x1 -> 3x3 - act_3_4 = object_fifo( - "act_3_4", ComputeTile3, ComputeTile4, 2, tensorLayer2Out_ty - ) # 3x3 -> 1x1 - act_5_4 = object_fifo( - "act_5_4", ComputeTile5, ComputeTile4, 2, tensorLayer2Out_ty - ) # 3x3 -> 1x1 + act2_fifo_names = ["act2_02_03_05", "act2_15_12_14", "act2_22_23_25"] + act2_fifos = {} + + act3_fifo_names_1 = ["act3_03_04", "act3_14_13", "act3_23_24"] + act3_fifo_1 = {} + + act3_fifo_names_2 = ["act3_05_04", "act3_12_13", "act3_25_24"] + act3_fifo_2 = {} + + for i in range(n_cols): + # 1x1 -> 3x3 + act2_fifos[act2_fifo_names[i]] = object_fifo( + act2_fifo_names[i], + cores[i][0], + [cores[i][1], cores[i][3]], + 2, + tensorLayer1Out_ty, + ) + + # 3x3 -> 1x1 + act3_fifo_1[act3_fifo_names_1[i]] = object_fifo( + act3_fifo_names_1[i], + cores[i][1], + cores[i][2], + 2, + tensorLayer2Out_ty, + ) + # 3x3 -> 1x1 + act3_fifo_2[act3_fifo_names_2[i]] = object_fifo( + act3_fifo_names_2[i], + cores[i][3], + cores[i][2], + 2, + tensorLayer2Out_ty, + ) # output tensor outOFL2L3 = object_fifo( - "outOFL2L3", ComputeTile4, ShimTile, 2, tensorLayer3Out_ty + "outOFL2L3", cores[2][2], shims[2], 2, tensorLayer3Out_ty ) - - # 1x1 conv2d - @core(ComputeTile2, "conv2dk1.o") - def core_body(): - for _ in range_(sys.maxsize): - - # acquire weights once - element0Weights = of_wts_buf_00.acquire(ObjectFifoPort.Consume, 1) - scale = memref.load(rtpComputeTile2, [0]) - for _ in range_(tensorInH): - element0ActivactionsIn = of_inOF_act_L3L2.acquire( - ObjectFifoPort.Consume, 1 + conv3_out_fifo = [ + act1_fifos[act1_fifo_names[1]], + act1_fifos[act1_fifo_names[2]], + outOFL2L3, + ] + conv3_out_fifo_names = ["act1_04_15_11", "act1_13_22_21", "outOFL2L3"] + # # 1x1 conv2d + for i in range(n_cols): + + @core(cores[i][0], conv1_kernels[i]) + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][0] + ].acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtp[i][0], [0]) + for _ in for_(tensorInH): + element0ActivactionsIn = act1_fifos[ + act1_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 1) + element0ActivactionsOut 
= act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv1_kernels_call[i], + [ + element0ActivactionsIn, + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit, + scale, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act1_fifo_names[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Produce, act2_fifo_names[i], 1 + ) + yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][0], 1 ) - element0ActivactionsOut = of_act_2_3_5.acquire( - ObjectFifoPort.Produce, 1 + yield_([]) + + # 3x3 conv2d OFM 0-31 + for i in range(n_cols): + + @core(cores[i][1], "conv2dk3.o") + def core_body(): + scale = 11 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][1] + ].acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile03, 0) + + # pre-amble: top row + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 ) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) res = call( - conv2dk1, + conv2dk3, [ - element0ActivactionsIn, + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], element0Weights, element0ActivactionsOut, tensorInW, - tensorL1InC, - tensorL1OutC, + tensorInCInit, + tensorInCInit, + 3, + 3, + 0, scale, + 0, ], ) - - objectfifo_release(ObjectFifoPort.Consume, "inOF_act_L3L2", 1) - - objectfifo_release(ObjectFifoPort.Produce, "act_2_3_5", 1) - yield_([]) - objectfifo_release(ObjectFifoPort.Consume, "wts_buf_00", 1) - yield_([]) - - # 3x3 conv2d OFM 0-31 - @core(ComputeTile3, "conv2dk3.o") - def core_body(): - scale = 11 - for _ in range_(sys.maxsize): - - # acquire weights and rtps once - element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) - # scale = memref.load(rtpComputeTile3, 0) - - # pre-amble: top row - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 2 - ) - element0ActivactionsOut = act_3_4.acquire(ObjectFifoPort.Produce, 1) - res = call( - conv2dk3, - [ - elementActivactionsIn[0], - elementActivactionsIn[0], - elementActivactionsIn[1], - element0Weights, - element0ActivactionsOut, - tensorInW, - tensorL2InC, - tensorL2OutC, - 3, - 3, - 0, - scale, - 0, - ], - ) - objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) - - # middle - for _ in range_(tensorInH - 2): - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 3 + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 ) - element0ActivactionsOut = act_3_4.acquire( - ObjectFifoPort.Produce, 1 + + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 3) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit, + 3, + 3, + 1, + scale, + 0, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) + yield_([]) + + # last part + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 ) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) res 
= call( conv2dk3, [ elementActivactionsIn[0], elementActivactionsIn[1], - elementActivactionsIn[2], + elementActivactionsIn[1], element0Weights, element0ActivactionsOut, tensorInW, - tensorL2InC, - tensorL2OutC, + tensorInCInit, + tensorInCInit, 3, 3, - 1, + 2, scale, 0, ], ) - objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) - objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) - yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 2 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) - # last part - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 2 - ) - element0ActivactionsOut = act_3_4.acquire(ObjectFifoPort.Produce, 1) - res = call( - conv2dk3, - [ - elementActivactionsIn[0], - elementActivactionsIn[1], - elementActivactionsIn[1], - element0Weights, - element0ActivactionsOut, - tensorInW, - tensorL2InC, - tensorL2OutC, - 3, - 3, - 2, - scale, - 0, - ], - ) - - objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) - objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) - - objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) - yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 + ) + yield_([]) # 3x3 conv2d OFM 32-63 - @core(ComputeTile5, "conv2dk3.o") - def core_body(): - scale = 11 - for _ in range_(sys.maxsize): - - # acquire weights and rtps once - element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) - # scale = memref.load(rtpComputeTile5, 0) - - # pre-amble: top row - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 2 - ) - element0ActivactionsOut = act_5_4.acquire(ObjectFifoPort.Produce, 1) - res = call( - conv2dk3, - [ - elementActivactionsIn[0], - elementActivactionsIn[0], - elementActivactionsIn[1], - element0Weights, - element0ActivactionsOut, - tensorInW, - tensorL2InC, - tensorL2OutC, - 3, - 3, - 0, - scale, - tensorL2OutC // 2, - ], - ) - - objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) - - # middle - for _ in range_(tensorInH - 2): - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 3 - ) - element0ActivactionsOut = act_5_4.acquire( - ObjectFifoPort.Produce, 1 + + for i in range(n_cols): + + @core(cores[i][3], "conv2dk3.o") + def core_body(): + scale = 11 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][1] + ].acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile05, 0) + + # pre-amble: top row + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 ) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) res = call( conv2dk3, [ + elementActivactionsIn[0], elementActivactionsIn[0], elementActivactionsIn[1], - elementActivactionsIn[2], element0Weights, element0ActivactionsOut, tensorInW, - tensorL2InC, - tensorL2OutC, + tensorInCInit, + tensorInCInit, 3, 3, - 1, + 0, scale, - tensorL2OutC // 2, + tensorInCInit // 2, ], ) - objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) - objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) - yield_([]) - - # last part - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 2 - ) - element0ActivactionsOut = act_5_4.acquire(ObjectFifoPort.Produce, 1) - res = call( - conv2dk3, - [ - elementActivactionsIn[0], - elementActivactionsIn[1], - elementActivactionsIn[1], - element0Weights, - element0ActivactionsOut, - tensorInW, - 
tensorL2InC, - tensorL2OutC, - 3, - 3, - 2, - scale, - tensorL2OutC // 2, - ], - ) - objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) - objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) - objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) - yield_([]) - - # # 1x1 conv2d and add skip - @core(ComputeTile4, "conv2dk1_skip.o") - def core_body(): - for _ in range_(sys.maxsize): - - # acquire weights and rtps once - element0Weights = wts_buf_02.acquire(ObjectFifoPort.Consume, 1) - scale = memref.load(rtpComputeTile4, [0]) - skipScale = memref.load(rtpComputeTile4, [1]) - - for _ in range_(tensorInH): - element0ActivactionsIn = act_3_4.acquire( - ObjectFifoPort.Consume, 1 - ) - element1ActivactionsIn = act_5_4.acquire( - ObjectFifoPort.Consume, 1 - ) - elementSkipsIn = of_skip_buf.acquire(ObjectFifoPort.Consume, 1) - elementActivactionsOut = outOFL2L3.acquire( - ObjectFifoPort.Produce, 1 + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 ) - call( - conv2dk1_skip, + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 3) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit, + 3, + 3, + 1, + scale, + tensorInCInit // 2, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + yield_([]) + + # last part + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, [ - element0ActivactionsIn, - element1ActivactionsIn, + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], element0Weights, - elementActivactionsOut, - elementSkipsIn, + element0ActivactionsOut, tensorInW, - tensorL3InC, - tensorL3OutC, + tensorInCInit, + tensorInCInit, + 3, + 3, + 2, scale, - skipScale, + tensorInCInit // 2, ], ) - objectfifo_release(ObjectFifoPort.Produce, "outOFL2L3", 1) - objectfifo_release(ObjectFifoPort.Consume, "act_3_4", 1) - objectfifo_release(ObjectFifoPort.Consume, "act_5_4", 1) - objectfifo_release(ObjectFifoPort.Consume, "skip_buf", 1) + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 2 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 + ) + yield_([]) + + # # 1x1 conv2d and add skip + for i in range(n_cols): + + @core(cores[i][2], conv3_kernels[i]) + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][2] + ].acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtp[i][2], [0]) + skipScale = memref.load(rtp[i][2], [1]) + + for _ in for_(tensorInH): + element0ActivactionsIn = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Consume, 1) + element1ActivactionsIn = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Consume, 1) + + elementActivactionsOut = conv3_out_fifo[i].acquire( + ObjectFifoPort.Produce, 1 + ) + elementSkipsIn = skip_fifos[skip_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 1 + 
) + call( + conv3_kernels_call[i], + [ + element0ActivactionsIn, + element1ActivactionsIn, + element0Weights, + elementActivactionsOut, + elementSkipsIn, + tensorInW, + tensorInCInit, + tensorInCRest, + scale, + skipScale, + ], + ) + objectfifo_release( + ObjectFifoPort.Consume, act3_fifo_names_1[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Consume, act3_fifo_names_2[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, conv3_out_fifo_names[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Consume, skip_fifo_names[i], 1 + ) + yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][2], 1 + ) yield_([]) - objectfifo_release(ObjectFifoPort.Consume, "wts_buf_02", 1) - yield_([]) # instruction stream generation - activationsInSize32b = (tensorInW * tensorInH * tensorInC) // 4 - acitivationsOutSize32b = activationsInSize32b - totalWeightsSize32b = ( - tensorL1InC * tensorL1OutC - + 3 * 3 * tensorL2InC * tensorL2OutC - + tensorL3InC * tensorL3OutC + activationsInSize32b = (tensorInW * tensorInH * tensorInCInit) // 4 + acitivationsOutSize32b = (tensorInW * tensorInH * tensorInCRest) // 4 + + totalWeightsSize32b_init = ( + tensorInCInit * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + 2 * tensorInCInit * tensorInCRest + ) // 4 + + totalWeightsSize32b_rest = ( + tensorInCInit * tensorInCRest + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest ) // 4 + totalWeightsSize32b_complete = ( + totalWeightsSize32b_init + repeat * totalWeightsSize32b_rest + ) + activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty) - weightsInL3_ty = MemRefType.get((totalWeightsSize32b,), int32_ty) + activationsOutL3_ty = MemRefType.get((acitivationsOutSize32b,), int32_ty) + weightsInL3_ty_init = MemRefType.get((totalWeightsSize32b_init,), int32_ty) + weightsInL3_ty_rest = MemRefType.get((totalWeightsSize32b_rest,), int32_ty) - @FuncOp.from_py_func(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty) + weightsInL3_ty_complete = MemRefType.get( + (totalWeightsSize32b_complete,), int32_ty + ) + + @FuncOp.from_py_func( + activationsInL3_ty, weightsInL3_ty_complete, activationsOutL3_ty + ) def sequence(inputFromL3, weightsFromL3, outputToL3): - if enableTrace: - # Trace output - - # Trace_Event0, Trace_Event1: Select which events to trace. - # Note that the event buffers only appear to be transferred to DDR in - # bursts of 256 bytes. If less than 256 bytes are written, you may not - # see trace output, or only see it on the next iteration of your - # kernel invocation, as the buffer gets filled up. Note that, even - # though events are encoded as 4 byte words, it may take more than 64 - # events to fill the buffer to 256 bytes and cause a flush, since - # multiple repeating events can be 'compressed' by the trace mechanism. - # In order to always generate sufficient events, we add the "assert - # TRUE" event to one slot, which fires every cycle, and thus fills our - # buffer quickly. 
- - # Some events: - # TRUE (0x01) - # STREAM_STALL (0x18) - # LOCK_STALL (0x1A) - # EVENTS_CORE_INSTR_EVENT_1 (0x22) - # EVENTS_CORE_INSTR_EVENT_0 (0x21) - # INSTR_VECTOR (0x25) Core executes a vecotr MAC, ADD or compare instruction - # INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock .acquire instruction - # INSTR_LOCK_.release_REQ (0x2D) Core executes a lock .release instruction - # EVENTS_CORE_PORT_RUNNING_1 (0x4F) - # EVENTS_CORE_PORT_RUNNING_0 (0x4B) - - # Trace_Event0 (4 slots) - ipu_write32(0, 4, 0x340E0, 0x4B222125) - # Trace_Event1 (4 slots) - ipu_write32(0, 4, 0x340E4, 0x2D2C1A4F) - - # Event slots as configured above: - # 0: Kernel executes vector instruction - # 1: Event 0 -- Kernel starts - # 2: Event 1 -- Kernel done - # 3: Port_Running_0 - # 4: Port_Running_1 - # 5: Lock Stall - # 6: Lock .acquire Instr - # 7: Lock .release Instr - - # Stream_Switch_Event_Port_Selection_0 - # This is necessary to capture the Port_Running_0 and Port_Running_1 events - ipu_write32(0, 4, 0x3FF00, 0x121) - - # Trace_Control0: Define trace start and stop triggers. Set start event TRUE. - ipu_write32(0, 4, 0x340D0, 0x10000) - - # Start trace copy out. - ipu_writebd_shimtile( - bd_id=3, - buffer_length=trace_sz_in_i32s, - buffer_offset=acitivationsOutSize32b, - enable_packet=0, - out_of_order_id=0, - packet_id=0, - packet_type=0, - column=0, - column_num=1, - d0_stepsize=0, - d0_wrap=0, - d1_stepsize=0, - d1_wrap=0, - d2_stepsize=0, - ddr_id=2, - iteration_current=0, - iteration_stepsize=0, - iteration_wrap=0, - lock_acq_enable=0, - lock_acq_id=0, - lock_acq_val=0, - lock_rel_id=0, - lock_rel_val=0, - next_bd=0, - use_next_bd=0, - valid_bd=1, - ) - ipu_write32(0, 2, 0x1D20C, 0x3) - - # write RTP parameters - IpuWriteRTPOp( - "rtpComputeTile2", col=0, row=2, index=0, value=1 - ) # scale - IpuWriteRTPOp( - "rtpComputeTile3", col=0, row=3, index=0, value=1 - ) # scale - IpuWriteRTPOp( - "rtpComputeTile5", col=0, row=5, index=0, value=1 - ) # scale - IpuWriteRTPOp( - "rtpComputeTile4", col=0, row=4, index=0, value=1 - ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input - IpuWriteRTPOp( - "rtpComputeTile4", col=0, row=4, index=1, value=0 - ) # skip_scale + for c, col in enumerate(rtp_name): + for r, row in enumerate(col): + IpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1) # scale + + IpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=0) + IpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=1) + + IpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=0) + + IpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=0) + + # # # write RTP parameters + # IpuWriteRTPOp( + # "rtpComputeTile02", col=0, row=2, index=0, value=1 + # ) # scale + # IpuWriteRTPOp( + # "rtpComputeTile03", col=0, row=3, index=0, value=1 + # ) # scale + # IpuWriteRTPOp( + # "rtpComputeTile05", col=0, row=5, index=0, value=1 + # ) # scale + # IpuWriteRTPOp( + # "rtpComputeTile04", col=0, row=4, index=0, value=1 + # ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input + # IpuWriteRTPOp( + # "rtpComputeTile04", col=0, row=4, index=1, value=0 + # ) # skip_scale ipu_dma_memcpy_nd( - metadata="inOF_act_L3L2", + metadata="act1_00_02_01", bd_id=0, mem=inputFromL3, sizes=[1, 1, 1, activationsInSize32b], @@ -625,15 +874,40 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): sizes=[1, 1, 1, acitivationsOutSize32b], ) ipu_dma_memcpy_nd( - 
metadata="inOF_wts_0_L3L2", + metadata="wts_0_L3L2", + bd_id=1, + mem=weightsFromL3, + sizes=[1, 1, 1, totalWeightsSize32b_init], + ) + + ipu_dma_memcpy_nd( + metadata="wts_1_L3L2", + bd_id=1, + mem=weightsFromL3, + offsets=[0, 0, 0, totalWeightsSize32b_init], + sizes=[1, 1, 1, totalWeightsSize32b_rest], + ) + + ipu_dma_memcpy_nd( + metadata="wts_2_L3L2", bd_id=1, mem=weightsFromL3, - sizes=[1, 1, 1, totalWeightsSize32b], + offsets=[ + 0, + 0, + 0, + totalWeightsSize32b_init + totalWeightsSize32b_rest, + ], + sizes=[1, 1, 1, totalWeightsSize32b_rest], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + ipu_sync(column=1, row=0, direction=0, channel=0) - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) -bottleneck4AIEs() +resnet_conv_x() From 384650c5f036a0e7c7f3499d2140462a76aa0b8a Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 22 Apr 2024 10:15:03 -0600 Subject: [PATCH 06/11] readme update --- programming_examples/ml/bottleneck/README.md | 7 ---- programming_examples/ml/conv2d/README.md | 9 +---- .../ml/conv2d_fused_relu/README.md | 9 +---- programming_examples/ml/resnet/README.md | 8 ---- programming_guide/section-6/README.md | 39 +++++++++++++++++++ 5 files changed, 41 insertions(+), 31 deletions(-) create mode 100644 programming_guide/section-6/README.md diff --git a/programming_examples/ml/bottleneck/README.md b/programming_examples/ml/bottleneck/README.md index 144b8e36f2..40a69e8576 100644 --- a/programming_examples/ml/bottleneck/README.md +++ b/programming_examples/ml/bottleneck/README.md @@ -115,11 +115,4 @@ make To run the design: ``` make run_py -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - ``` \ No newline at end of file diff --git a/programming_examples/ml/conv2d/README.md b/programming_examples/ml/conv2d/README.md index 81b25f3e52..b2d93f066d 100644 --- a/programming_examples/ml/conv2d/README.md +++ b/programming_examples/ml/conv2d/README.md @@ -56,12 +56,5 @@ make To run the design: ``` -make run -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - +make run_py ``` \ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/README.md b/programming_examples/ml/conv2d_fused_relu/README.md index 68e7e9b8cf..3f4a2264cd 100644 --- a/programming_examples/ml/conv2d_fused_relu/README.md +++ b/programming_examples/ml/conv2d_fused_relu/README.md @@ -88,12 +88,5 @@ make To run the design: ``` -make run -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - +make run_py ``` \ No newline at end of file diff --git a/programming_examples/ml/resnet/README.md b/programming_examples/ml/resnet/README.md index 6382079c62..de4cc92535 100755 --- a/programming_examples/ml/resnet/README.md +++ b/programming_examples/ml/resnet/README.md @@ -107,14 +107,6 @@ To run the design: make run_py ``` -### Prerequisites - -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - -``` - ## References [1] He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770-778). 
diff --git a/programming_guide/section-6/README.md b/programming_guide/section-6/README.md new file mode 100644 index 0000000000..f54c812ab3 --- /dev/null +++ b/programming_guide/section-6/README.md @@ -0,0 +1,39 @@ + + +# Section 6 - Larger Example Designs + +There are a number of example designs available [here](../../programming_examples/) which further help explain many of the unique features of AI Engines and the NPU array in Ryzen™ AI. This section contains more complex application designs for both vision and machine learning use cases. In particular we will describe a ResNet implementation on for Ryzen™ AI. + +## Vision Kernels + +| Design name | Data type | Description | +|-|-|-| +| [Vision Passthrough](../../programming_examples/vision/vision_passthrough/) | i8 | A simple pipeline with just one `passThrough` kernel. This pipeline's main purpose is to test whether the data movement works correctly to copy a greyscale image. | +| [Color Detect](../../programming_examples/vision/color_detect/) | i32 | This multi-kernel, multi-core pipeline detects colors in an RGBA image. | +| [Edge Detect](../../programming_examples/vision/edge_detect/) | i32 | A mult-kernel, multi-core pipeline that detects edges in an image and overlays the detection on the original image. | +| [Color Threshold](../../programming_examples/vision/color_threshold/) | i32 | A mult-core data-parallel implementation of color thresholding of a RGBA image. | + + +## Machine Learning Designs + +| Design name | Data type | Description | +|-|-|-| +|[bottleneck](../../programming_examples/ml/bottleneck/)|ui8|A Bottleneck Residual Block is a variant of the residual block that utilises three convolutions, using 1x1, 3x3 and 1x1 filter sizes, respectively. The use of a bottleneck reduces the number of parameters and computations.| +|[resnet](../../programming_examples/ml/resnet/)|ui8|ResNet with offloaded conv2_x bottleneck blocks. The implementation features kernel fusion and dataflow optimizations highlighting the unique architectural capabilties of AI Engines.| + +## Exercises + +1. In [bottlneck](../../programming_examples/ml/bottleneck/) design following a dataflow approach, how many elements does the 3x3 convolution operation require to proceed with its computation? +2. Suppose you have a bottleneck block with input dimensions of 32x32x256. After passing through the 1x1 convolutional layer, the output dimensions become 32x32x64. What would be the output dimensions after the subsequent 3x3 convolutional layer, assuming a stride of 1 and no padding and output channel of 64? 
+ +----- +[[Prev - Section 5](../section-5/)] [[Top](..)] + From 16b1be502cab1ad60bde59a5af8367ca2ee7b2b9 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 22 Apr 2024 10:45:55 -0600 Subject: [PATCH 07/11] conv2d runtime fix --- programming_examples/ml/conv2d/Makefile | 2 +- programming_examples/ml/conv2d/run.lit | 2 +- programming_examples/ml/conv2d/test.py | 267 ++++++++++++------------ 3 files changed, 139 insertions(+), 132 deletions(-) diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile index 0274f3fef7..0f4c925ed3 100755 --- a/programming_examples/ml/conv2d/Makefile +++ b/programming_examples/ml/conv2d/Makefile @@ -32,4 +32,4 @@ clean: chess* *.o insts.txt \ *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py \ No newline at end of file + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit index 1eeef90b94..5220b6f5e4 100644 --- a/programming_examples/ml/conv2d/run.lit +++ b/programming_examples/ml/conv2d/run.lit @@ -1,4 +1,4 @@ -// (c) Copyright 2023 Advanced Micro Devices, Inc. +// (c) Copyright 2024 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai, chess, torch diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py index 1dc847d8fe..9d8d08e763 100644 --- a/programming_examples/ml/conv2d/test.py +++ b/programming_examples/ml/conv2d/test.py @@ -14,136 +14,143 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute - +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "conv2d" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("int8") - -shape_total_wts = (4096, 1) -shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' -shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_out = (32, 8, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor) -int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor) -conv_scale = 7.6294e-06 # scale to convert int8 output to floating point -int8_scale = 0.0078 # scale to convert int8 output to floating point -min = -128 -max = 127 -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# 
------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class conv2d_int_model(nn.Module): - def __init__(self, in_planes=64, planes=64): - super(conv2d_int_model, self).__init__() - self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) - - def forward(self, x): - out_int = self.conv(x) - out_quant = out_int * conv_scale # int8 x int8 leads to int32 output - out_float = int8_scale * torch.clamp( - torch.round(out_quant / int8_scale), min, max - ) # converting to int8 range - return out_float - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = conv2d_int_model() -model.eval() -model.conv.weight.data.copy_(int_weight) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") -total_wts = np.concatenate((wts1), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 8, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(64, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ - -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=2 * int8_scale, -) -print("\nPASS!\n") +def main(opts): + design = "conv2d" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("int8") + + shape_total_wts = (4096, 1) + shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' + 
shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_out = (32, 8, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor) + int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor) + conv_scale = 7.6294e-06 # scale to convert int8 output to floating point + int8_scale = 0.0078 # scale to convert int8 output to floating point + min = -128 + max = 127 + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class conv2d_int_model(nn.Module): + def __init__(self, in_planes=64, planes=64): + super(conv2d_int_model, self).__init__() + self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) + + def forward(self, x): + out_int = self.conv(x) + out_quant = out_int * conv_scale # int8 x int8 leads to int32 output + out_float = int8_scale * torch.clamp( + torch.round(out_quant / int8_scale), min, max + ) # converting to int8 range + return out_float + + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = conv2d_int_model() + model.eval() + model.conv.weight.data.copy_(int_weight) + + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") + total_wts = np.concatenate((wts1), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 8, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(64, 32, 32) + ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and 
the golden reference + # ------------------------------------------------------ + + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=2 * int8_scale, + ) + print("\nPASS!\n") + + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) From 7f9441018937f5440943e075cf71588d52d5f1b6 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 22 Apr 2024 11:59:17 -0600 Subject: [PATCH 08/11] runtime argument fixes --- programming_examples/ml/bottleneck/Makefile | 2 +- programming_examples/ml/bottleneck/test.py | 348 ++++---- .../ml/conv2d_fused_relu/Makefile | 2 +- .../ml/conv2d_fused_relu/test.py | 270 +++--- .../ml/resnet/layers_conv2_x/Makefile | 2 +- .../ml/resnet/layers_conv2_x/test.py | 834 +++++++++--------- 6 files changed, 738 insertions(+), 720 deletions(-) diff --git a/programming_examples/ml/bottleneck/Makefile b/programming_examples/ml/bottleneck/Makefile index f5c6e4561f..47ca6a78f7 100755 --- a/programming_examples/ml/bottleneck/Makefile +++ b/programming_examples/ml/bottleneck/Makefile @@ -37,4 +37,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/bottleneck/test.py b/programming_examples/ml/bottleneck/test.py index 34f6347175..2613acbab2 100644 --- a/programming_examples/ml/bottleneck/test.py +++ b/programming_examples/ml/bottleneck/test.py @@ -14,177 +14,183 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute - +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "bottleneck_int8" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_in_act = (32, 32, 32, 8) -shape_in_wts1 = (8, 32, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_in_wts2 = (8, 8, 3, 3, 8, 8) # out,in,ky,kx,in8,out8 -shape_in_wts3 = (32, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_total_wts = (69632, 1) -shape_out = (32, 32, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 100, (1, 256, 32, 32)).type(torch.FloatTensor) -int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor) -int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor) -int_weight3 = torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor) - -inp_scale1 = 0.5 -inp_scale2 = 0.5 -inp_scale3 = 0.5 -inp_scale4 = 0.5 - -weight_scale1 = 0.5 -weight_scale2 = 0.5 -weight_scale3 = 0.5 - -combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2) -combined_scale2 = 
-math.log2(inp_scale2 * weight_scale2 / inp_scale3) -combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1) -combined_scale4 = -math.log2(inp_scale1 / inp_scale4) -conv_scale = 0.0039 # scale to convert int8 output to floating point -relu_scale = 0.0078 # scale to convert int8 output to floating point -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class bottleneck_int8(nn.Module): - def __init__(self, in_planes=256, planes=64): - super(bottleneck_int8, self).__init__() - self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False) - self.conv2 = nn.Conv2d( - 64, 64, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False) - - self.relu1 = nn.ReLU() - self.relu2 = nn.ReLU() - self.relu3 = nn.ReLU() - - def forward(self, x): - conv1_out = self.conv1(x) * inp_scale1 * weight_scale1 - relu1_out = torch.clamp( - torch.round(self.relu1(conv1_out) / inp_scale2), min, max - ) # convert to int and apply relu - conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2 - relu2_out = torch.clamp( - torch.round(self.relu2(conv2_out) / inp_scale3), min, max - ) - conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3 - same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127) - skip_add = inp_scale1 * (same_scale_init + int_inp) - final_out = inp_scale4 * ( - torch.clamp(torch.round(skip_add / inp_scale4), min, max) - ) - return final_out - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = bottleneck_int8() -model.eval() -model.conv1.weight.data.copy_(int_weight1) -model.conv2.weight.data.copy_(int_weight2) -model.conv3.weight.data.copy_(int_weight3) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") -wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") -wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") - -total_wts = np.concatenate((wts1, wts2, wts3), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4 - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - 
write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 32, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(256, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=inp_scale4, -) - -print("\nPASS!\n") +def main(opts): + design = "bottleneck_int8" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_in_act = (32, 32, 32, 8) + shape_in_wts1 = (8, 32, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_in_wts2 = (8, 8, 3, 3, 8, 8) # out,in,ky,kx,in8,out8 + shape_in_wts3 = (32, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_total_wts = (69632, 1) + shape_out = (32, 32, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 100, (1, 256, 32, 32)).type(torch.FloatTensor) + int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor) + int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor) + int_weight3 = torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor) + + inp_scale1 = 0.5 + inp_scale2 = 0.5 + inp_scale3 = 0.5 + inp_scale4 = 0.5 + + weight_scale1 = 0.5 + weight_scale2 = 0.5 + weight_scale3 = 0.5 + + combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2) + combined_scale2 = -math.log2(inp_scale2 * weight_scale2 / inp_scale3) + combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1) + combined_scale4 = -math.log2(inp_scale1 / inp_scale4) + conv_scale = 0.0039 # scale to convert int8 output to floating point + relu_scale = 0.0078 # scale to convert int8 output to floating point + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class bottleneck_int8(nn.Module): + def __init__(self, in_planes=256, planes=64): + 
super(bottleneck_int8, self).__init__() + self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d( + 64, 64, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False) + + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + + def forward(self, x): + conv1_out = self.conv1(x) * inp_scale1 * weight_scale1 + relu1_out = torch.clamp( + torch.round(self.relu1(conv1_out) / inp_scale2), min, max + ) # convert to int and apply relu + conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2 + relu2_out = torch.clamp( + torch.round(self.relu2(conv2_out) / inp_scale3), min, max + ) + conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3 + same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127) + skip_add = inp_scale1 * (same_scale_init + int_inp) + final_out = inp_scale4 * ( + torch.clamp(torch.round(skip_add / inp_scale4), min, max) + ) + return final_out + + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = bottleneck_int8() + model.eval() + model.conv1.weight.data.copy_(int_weight1) + model.conv2.weight.data.copy_(int_weight2) + model.conv3.weight.data.copy_(int_weight3) + + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + + total_wts = np.concatenate((wts1, wts2, wts3), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4 + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 32, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(256, 32, 32) + ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + 
atol=inp_scale4, + ) + + print("\nPASS!\n") + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/ml/conv2d_fused_relu/Makefile b/programming_examples/ml/conv2d_fused_relu/Makefile index 80cb34dc08..5911238a7a 100755 --- a/programming_examples/ml/conv2d_fused_relu/Makefile +++ b/programming_examples/ml/conv2d_fused_relu/Makefile @@ -32,4 +32,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE diff --git a/programming_examples/ml/conv2d_fused_relu/test.py b/programming_examples/ml/conv2d_fused_relu/test.py index 5bfe139112..05ea92d677 100644 --- a/programming_examples/ml/conv2d_fused_relu/test.py +++ b/programming_examples/ml/conv2d_fused_relu/test.py @@ -14,138 +14,144 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute - +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "conv2d_with_relu" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_total_wts = (4096, 1) -shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' -shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_out = (32, 8, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor) -int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor) -conv_scale = 0.0039 # scale to convert int8 output to floating point -relu_scale = 0.0078 # scale to convert int8 output to floating point -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class conv2d_relu_int_model(nn.Module): - def __init__(self, in_planes=64, planes=64): - super(conv2d_relu_int_model, self).__init__() - self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - out_int = self.conv(x) - out_float = out_int * conv_scale - out_int = self.relu(out_float) - out_float = relu_scale * torch.clamp( - torch.round(out_int / relu_scale), min, max - ) # converting to int to do proper clipping - return out_float - - -# ------------------------------------------------------ -# Pytorch baseline -# 
------------------------------------------------------ -model = conv2d_relu_int_model() -model.eval() -model.conv.weight.data.copy_(int_weight) -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") -total_wts = np.concatenate((wts1), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * relu_scale - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 8, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(64, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=2 * relu_scale, -) - -print("\nPASS!\n") +def main(opts): + design = "conv2d_with_relu" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_total_wts = (4096, 1) + shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' + shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_out = (32, 8, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor) + int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor) + conv_scale = 0.0039 # scale to convert int8 output to floating point + relu_scale = 0.0078 # scale to convert int8 output to floating point + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get 
device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class conv2d_relu_int_model(nn.Module): + def __init__(self, in_planes=64, planes=64): + super(conv2d_relu_int_model, self).__init__() + self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + out_int = self.conv(x) + out_float = out_int * conv_scale + out_int = self.relu(out_float) + out_float = relu_scale * torch.clamp( + torch.round(out_int / relu_scale), min, max + ) # converting to int to do proper clipping + return out_float + + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = conv2d_relu_int_model() + model.eval() + model.conv.weight.data.copy_(int_weight) + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") + total_wts = np.concatenate((wts1), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * relu_scale + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 8, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(64, 32, 32) + ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=2 * relu_scale, + ) + + print("\nPASS!\n") + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile index d8f1b7261a..6218e61fb5 100755 --- 
a/programming_examples/ml/resnet/layers_conv2_x/Makefile +++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile @@ -44,4 +44,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/test.py b/programming_examples/ml/resnet/layers_conv2_x/test.py index 02dc01b127..5784a4d30a 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/test.py +++ b/programming_examples/ml/resnet/layers_conv2_x/test.py @@ -14,423 +14,429 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute - +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "resnet_conv2_x_int8" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_in_act = (32, 8, 32, 8) -shape_total_wts = (212992, 1) -shape_out = (32, 32, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) -block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) -block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) -block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) -block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) - -block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor) -block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) -block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor) - -block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor) -block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) -block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor) - -init_scale = 0.5 -block_0_relu_1 = 0.5 -block_0_relu_2 = 0.5 -block_0_relu_3 = 0.5 - -block_0_weight_scale1 = 0.5 -block_0_weight_scale2 = 0.5 -block_0_weight_scale3 = 0.5 -block_0_weight_scale_skip = 0.5 - -block_1_relu_1 = 0.5 -block_1_relu_2 = 0.5 -block_1_relu_3 = 0.5 - -block_1_weight_scale1 = 0.5 -block_1_weight_scale2 = 0.5 -block_1_weight_scale3 = 0.5 -block_1_quant_add_1 = 0.5 - -block_2_relu_1 = 0.5 -block_2_relu_2 = 0.5 -block_2_relu_3 = 0.5 - -block_2_weight_scale1 = 0.5 -block_2_weight_scale2 = 0.5 -block_2_weight_scale3 = 0.5 -block_2_quant_add_1 = 0.5 - -block_0_combined_scale1 = -math.log2( - init_scale * block_0_weight_scale1 / block_0_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_0_combined_scale2 = -math.log2( - block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 -) # RHS after second 
conv3x3 | clip 0-->255 -block_0_combined_scale3 = -math.log2( - block_0_relu_2 * block_0_weight_scale3 / init_scale -) # RHS after third conv1x1 | clip -128-->+127 -block_0_combined_scale_skip = -math.log2( - init_scale * block_0_weight_scale_skip / init_scale -) # LHS after conv1x1 | clip -128-->+127 -block_0_combined_scale4 = -math.log2( - init_scale / block_0_relu_3 -) # After addition | clip 0-->255 - -block_1_combined_scale1 = -math.log2( - block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_1_combined_scale2 = -math.log2( - block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 -) # RHS after second conv3x3 | clip 0-->255 -block_1_combined_scale3 = -math.log2( - block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1 -) # RHS after third conv1x1 | clip -128-->+127 -block_1_combined_scale4 = -math.log2( - block_1_quant_add_1 / block_1_relu_3 -) # After addition | clip 0-->255 - -block_2_combined_scale1 = -math.log2( - block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_2_combined_scale2 = -math.log2( - block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2 -) # RHS after second conv3x3 | clip 0-->255 -block_2_combined_scale3 = -math.log2( - block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 -) # RHS after third conv1x1 | clip -128-->+127 -block_2_combined_scale4 = -math.log2( - block_2_quant_add_1 / block_2_relu_3 -) # After addition | clip 0-->255 - -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class resnet_conv2_x_int8(nn.Module): - expansion = 4 - - def __init__(self, in_planes=64, planes=64): - super(resnet_conv2_x_int8, self).__init__() - - self.shortcut = nn.Conv2d( - in_planes, self.expansion * planes, kernel_size=1, bias=False - ) - # Bottleneck 0 - self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) - self.block_0_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_0_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_0_relu1 = nn.ReLU() - self.block_0_relu2 = nn.ReLU() - self.block_0_relu3 = nn.ReLU() - - # Bottleneck 1 - self.block_1_conv1 = nn.Conv2d( - self.expansion * planes, planes, kernel_size=1, bias=False - ) - self.block_1_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_1_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_1_relu1 = nn.ReLU() - self.block_1_relu2 = nn.ReLU() - self.block_1_relu3 = nn.ReLU() - - # Bottleneck 2 - self.block_2_conv1 = nn.Conv2d( - self.expansion * planes, planes, kernel_size=1, bias=False - ) - self.block_2_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_2_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_2_relu1 = nn.ReLU() - self.block_2_relu2 = nn.ReLU() - 
self.block_2_relu3 = nn.ReLU() - - def forward(self, x): - # **************** Bottleneck 0 **************** - block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1 - block_0_relu1_out = torch.clamp( - torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), - min, - max, - ) # convert to int and apply relu - block_0_conv2_out = ( - self.block_0_conv2(block_0_relu1_out) - * block_0_relu_1 - * block_0_weight_scale2 - ) - block_0_relu2_out = torch.clamp( - torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), - min, - max, - ) - block_0_conv3_out = ( - self.block_0_conv3(block_0_relu2_out) - * block_0_relu_2 - * block_0_weight_scale3 - ) - block_0_rhf_same_scale = torch.clamp( - torch.round(block_0_conv3_out / init_scale), -128, 127 - ) - - block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip - block_0_lhs_same_scale = torch.clamp( - torch.round(block_0_lhs_conv / init_scale), -128, 127 - ) - # convert to int and apply relu - - block_0_skip_add = init_scale * ( - block_0_rhf_same_scale + block_0_lhs_same_scale - ) - block_0_final_out = torch.clamp( - torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max - ) - # **************** Bottleneck 1 **************** - block_1_conv1_out = ( - self.block_1_conv1(block_0_final_out) - * block_0_relu_3 - * block_1_weight_scale1 - ) - block_1_relu1_out = torch.clamp( - torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), - min, - max, - ) # convert to int and apply relu - block_1_conv2_out = ( - self.block_1_conv2(block_1_relu1_out) - * block_1_relu_1 - * block_1_weight_scale2 - ) - block_1_relu2_out = torch.clamp( - torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), - min, - max, - ) - block_1_conv3_out = ( - self.block_1_conv3(block_1_relu2_out) - * block_1_relu_2 - * block_1_weight_scale3 - ) - block_1_rhf_same_scale = torch.clamp( - torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 - ) - - block_1_skip_add = block_0_relu_3 * (block_1_rhf_same_scale + block_0_final_out) - block_1_final_out = torch.clamp( - torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max - ) - - # **************** Bottleneck 2 **************** - block_2_conv1_out = ( - self.block_2_conv1(block_1_final_out) - * block_1_relu_3 - * block_2_weight_scale1 - ) - block_2_relu1_out = torch.clamp( - torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1), - min, - max, - ) # convert to int and apply relu - block_2_conv2_out = ( - self.block_2_conv2(block_2_relu1_out) - * block_2_relu_1 - * block_2_weight_scale2 - ) - block_2_relu2_out = torch.clamp( - torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2), - min, - max, - ) - block_2_conv3_out = ( - self.block_2_conv3(block_2_relu2_out) - * block_2_relu_2 - * block_2_weight_scale3 - ) - block_2_rhf_same_scale = torch.clamp( - torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 - ) - - block_2_skip_add = block_1_relu_3 * (block_2_rhf_same_scale + block_1_final_out) - block_2_final_out = block_2_relu_3 * ( - torch.clamp( - torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), +def main(opts): + design = "resnet_conv2_x_int8" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # 
------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_in_act = (32, 8, 32, 8) + shape_total_wts = (212992, 1) + shape_out = (32, 32, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) + block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) + block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) + block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) + block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) + + block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor) + block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) + block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor) + + block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor) + block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) + block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor) + + init_scale = 0.5 + block_0_relu_1 = 0.5 + block_0_relu_2 = 0.5 + block_0_relu_3 = 0.5 + + block_0_weight_scale1 = 0.5 + block_0_weight_scale2 = 0.5 + block_0_weight_scale3 = 0.5 + block_0_weight_scale_skip = 0.5 + + block_1_relu_1 = 0.5 + block_1_relu_2 = 0.5 + block_1_relu_3 = 0.5 + + block_1_weight_scale1 = 0.5 + block_1_weight_scale2 = 0.5 + block_1_weight_scale3 = 0.5 + block_1_quant_add_1 = 0.5 + + block_2_relu_1 = 0.5 + block_2_relu_2 = 0.5 + block_2_relu_3 = 0.5 + + block_2_weight_scale1 = 0.5 + block_2_weight_scale2 = 0.5 + block_2_weight_scale3 = 0.5 + block_2_quant_add_1 = 0.5 + + block_0_combined_scale1 = -math.log2( + init_scale * block_0_weight_scale1 / block_0_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_0_combined_scale2 = -math.log2( + block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_0_combined_scale3 = -math.log2( + block_0_relu_2 * block_0_weight_scale3 / init_scale + ) # RHS after third conv1x1 | clip -128-->+127 + block_0_combined_scale_skip = -math.log2( + init_scale * block_0_weight_scale_skip / init_scale + ) # LHS after conv1x1 | clip -128-->+127 + block_0_combined_scale4 = -math.log2( + init_scale / block_0_relu_3 + ) # After addition | clip 0-->255 + + block_1_combined_scale1 = -math.log2( + block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_1_combined_scale2 = -math.log2( + block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_1_combined_scale3 = -math.log2( + block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1 + ) # RHS after third conv1x1 | clip -128-->+127 + block_1_combined_scale4 = -math.log2( + block_1_quant_add_1 / block_1_relu_3 + ) # After addition | clip 0-->255 + + block_2_combined_scale1 = -math.log2( + block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_2_combined_scale2 = -math.log2( + block_2_relu_1 * block_2_weight_scale2 
/ block_2_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_2_combined_scale3 = -math.log2( + block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 + ) # RHS after third conv1x1 | clip -128-->+127 + block_2_combined_scale4 = -math.log2( + block_2_quant_add_1 / block_2_relu_3 + ) # After addition | clip 0-->255 + + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class resnet_conv2_x_int8(nn.Module): + expansion = 4 + + def __init__(self, in_planes=64, planes=64): + super(resnet_conv2_x_int8, self).__init__() + + self.shortcut = nn.Conv2d( + in_planes, self.expansion * planes, kernel_size=1, bias=False + ) + # Bottleneck 0 + self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.block_0_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_0_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_0_relu1 = nn.ReLU() + self.block_0_relu2 = nn.ReLU() + self.block_0_relu3 = nn.ReLU() + + # Bottleneck 1 + self.block_1_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_1_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_1_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_1_relu1 = nn.ReLU() + self.block_1_relu2 = nn.ReLU() + self.block_1_relu3 = nn.ReLU() + + # Bottleneck 2 + self.block_2_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_2_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_2_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_2_relu1 = nn.ReLU() + self.block_2_relu2 = nn.ReLU() + self.block_2_relu3 = nn.ReLU() + + def forward(self, x): + # **************** Bottleneck 0 **************** + block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + block_0_relu1_out = torch.clamp( + torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), + min, + max, + ) # convert to int and apply relu + block_0_conv2_out = ( + self.block_0_conv2(block_0_relu1_out) + * block_0_relu_1 + * block_0_weight_scale2 + ) + block_0_relu2_out = torch.clamp( + torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), min, max, ) - ) - return block_2_final_out - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = resnet_conv2_x_int8() -model.eval() -model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) -model.block_0_conv2.weight.data.copy_(block_0_int_weight_2) -model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) -model.shortcut.weight.data.copy_(block_0_int_weight_skip) - -model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) -model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) 
-model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) - -model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) -model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) -model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -block0_wts1 = ds.reorder_mat( - block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts2 = ds.reorder_mat( - block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts3 = ds.reorder_mat( - block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts_skip = ds.reorder_mat( - block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts = np.concatenate( - (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None -) - -block1_wts1 = ds.reorder_mat( - block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block1_wts2 = ds.reorder_mat( - block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block1_wts3 = ds.reorder_mat( - block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts2 = np.concatenate( - (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None -) - -block2_wts1 = ds.reorder_mat( - block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block2_wts2 = ds.reorder_mat( - block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block2_wts3 = ds.reorder_mat( - block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts3 = np.concatenate( - (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None -) - -total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * block_2_relu_3 - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 32, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(256, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=block_2_relu_3, -) - 
-print("\nPASS!\n") + block_0_conv3_out = ( + self.block_0_conv3(block_0_relu2_out) + * block_0_relu_2 + * block_0_weight_scale3 + ) + block_0_rhf_same_scale = torch.clamp( + torch.round(block_0_conv3_out / init_scale), -128, 127 + ) + + block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip + block_0_lhs_same_scale = torch.clamp( + torch.round(block_0_lhs_conv / init_scale), -128, 127 + ) + # convert to int and apply relu + + block_0_skip_add = init_scale * ( + block_0_rhf_same_scale + block_0_lhs_same_scale + ) + block_0_final_out = torch.clamp( + torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max + ) + # **************** Bottleneck 1 **************** + block_1_conv1_out = ( + self.block_1_conv1(block_0_final_out) + * block_0_relu_3 + * block_1_weight_scale1 + ) + block_1_relu1_out = torch.clamp( + torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), + min, + max, + ) # convert to int and apply relu + block_1_conv2_out = ( + self.block_1_conv2(block_1_relu1_out) + * block_1_relu_1 + * block_1_weight_scale2 + ) + block_1_relu2_out = torch.clamp( + torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), + min, + max, + ) + block_1_conv3_out = ( + self.block_1_conv3(block_1_relu2_out) + * block_1_relu_2 + * block_1_weight_scale3 + ) + block_1_rhf_same_scale = torch.clamp( + torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 + ) + + block_1_skip_add = block_0_relu_3 * (block_1_rhf_same_scale + block_0_final_out) + block_1_final_out = torch.clamp( + torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max + ) + + # **************** Bottleneck 2 **************** + block_2_conv1_out = ( + self.block_2_conv1(block_1_final_out) + * block_1_relu_3 + * block_2_weight_scale1 + ) + block_2_relu1_out = torch.clamp( + torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1), + min, + max, + ) # convert to int and apply relu + block_2_conv2_out = ( + self.block_2_conv2(block_2_relu1_out) + * block_2_relu_1 + * block_2_weight_scale2 + ) + block_2_relu2_out = torch.clamp( + torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2), + min, + max, + ) + block_2_conv3_out = ( + self.block_2_conv3(block_2_relu2_out) + * block_2_relu_2 + * block_2_weight_scale3 + ) + block_2_rhf_same_scale = torch.clamp( + torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 + ) + + block_2_skip_add = block_1_relu_3 * (block_2_rhf_same_scale + block_1_final_out) + block_2_final_out = block_2_relu_3 * ( + torch.clamp( + torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), + min, + max, + ) + ) + return block_2_final_out + + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = resnet_conv2_x_int8() + model.eval() + model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) + model.block_0_conv2.weight.data.copy_(block_0_int_weight_2) + model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) + model.shortcut.weight.data.copy_(block_0_int_weight_skip) + + model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) + model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) + model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) + + model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) + model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) + model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) + + golden_output = model(int_inp) + + # 
------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + block0_wts1 = ds.reorder_mat( + block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts2 = ds.reorder_mat( + block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts3 = ds.reorder_mat( + block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts_skip = ds.reorder_mat( + block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts = np.concatenate( + (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None + ) + + block1_wts1 = ds.reorder_mat( + block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts2 = ds.reorder_mat( + block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts3 = ds.reorder_mat( + block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts2 = np.concatenate( + (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None + ) + + block2_wts1 = ds.reorder_mat( + block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts2 = ds.reorder_mat( + block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts3 = ds.reorder_mat( + block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts3 = np.concatenate( + (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None + ) + + total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * block_2_relu_3 + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 32, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(256, 32, 32) + ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=block_2_relu_3, + ) + + print("\nPASS!\n") + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) From 223ed0aa4c5d17e3768ac13ccb153f84df444e32 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 
Apr 2024 20:31:29 -0600 Subject: [PATCH 09/11] Black files --- programming_examples/ml/bottleneck/test.py | 21 +++-- programming_examples/ml/conv2d/test.py | 16 ++-- .../ml/conv2d_fused_relu/test.py | 17 ++-- .../ml/resnet/layers_conv2_x/test.py | 82 ++++++++++++++----- 4 files changed, 101 insertions(+), 35 deletions(-) diff --git a/programming_examples/ml/bottleneck/test.py b/programming_examples/ml/bottleneck/test.py index 2613acbab2..48a9a8929c 100644 --- a/programming_examples/ml/bottleneck/test.py +++ b/programming_examples/ml/bottleneck/test.py @@ -15,9 +15,11 @@ import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils + torch.use_deterministic_algorithms(True) torch.manual_seed(0) + def main(opts): design = "bottleneck_int8" xclbin_path = opts.xclbin @@ -90,7 +92,6 @@ def main(opts): trace_size=trace_size, ) - # ------------------------------------------------------ # Define your golden reference # ------------------------------------------------------ @@ -117,14 +118,15 @@ def forward(self, x): torch.round(self.relu2(conv2_out) / inp_scale3), min, max ) conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3 - same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127) + same_scale_init = torch.clamp( + torch.round(conv3_out / inp_scale1), -128, 127 + ) skip_add = inp_scale1 * (same_scale_init + int_inp) final_out = inp_scale4 * ( torch.clamp(torch.round(skip_add / inp_scale4), min, max) ) return final_out - # ------------------------------------------------------ # Pytorch baseline # ------------------------------------------------------ @@ -141,7 +143,9 @@ def forward(self, x): # ------------------------------------------------------ ds = DataShaper() before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") @@ -161,7 +165,9 @@ def forward(self, x): stop = time.time_ns() if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) write_out_trace(trace, trace_file) npu_time = stop - start @@ -173,7 +179,9 @@ def forward(self, x): temp_out = aie_output.reshape(32, 32, 32, 8) temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") ofm_mem_fmt = temp_out.reshape(256, 32, 32) - ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) # ------------------------------------------------------ @@ -190,6 +198,7 @@ def forward(self, x): print("\nPASS!\n") + if __name__ == "__main__": p = test_utils.create_default_argparser() opts = p.parse_args(sys.argv[1:]) diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py index 9d8d08e763..1a8d2e7712 100644 --- a/programming_examples/ml/conv2d/test.py +++ b/programming_examples/ml/conv2d/test.py @@ -15,9 +15,11 @@ import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils + torch.use_deterministic_algorithms(True) 
torch.manual_seed(0) + def main(opts): design = "conv2d" xclbin_path = opts.xclbin @@ -71,7 +73,6 @@ def main(opts): trace_size=trace_size, ) - # ------------------------------------------------------ # Define your golden reference # ------------------------------------------------------ @@ -88,7 +89,6 @@ def forward(self, x): ) # converting to int8 range return out_float - # ------------------------------------------------------ # Pytorch baseline # ------------------------------------------------------ @@ -103,7 +103,9 @@ def forward(self, x): # ------------------------------------------------------ ds = DataShaper() before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") @@ -120,7 +122,9 @@ def forward(self, x): stop = time.time_ns() if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) write_out_trace(trace, trace_file) npu_time = stop - start @@ -132,7 +136,9 @@ def forward(self, x): temp_out = aie_output.reshape(32, 8, 32, 8) temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") ofm_mem_fmt = temp_out.reshape(64, 32, 32) - ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) # ------------------------------------------------------ diff --git a/programming_examples/ml/conv2d_fused_relu/test.py b/programming_examples/ml/conv2d_fused_relu/test.py index 05ea92d677..6fe407faaa 100644 --- a/programming_examples/ml/conv2d_fused_relu/test.py +++ b/programming_examples/ml/conv2d_fused_relu/test.py @@ -15,9 +15,11 @@ import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils + torch.use_deterministic_algorithms(True) torch.manual_seed(0) + def main(opts): design = "conv2d_with_relu" xclbin_path = opts.xclbin @@ -72,7 +74,6 @@ def main(opts): trace_size=trace_size, ) - # ------------------------------------------------------ # Define your golden reference # ------------------------------------------------------ @@ -91,7 +92,6 @@ def forward(self, x): ) # converting to int to do proper clipping return out_float - # ------------------------------------------------------ # Pytorch baseline # ------------------------------------------------------ @@ -105,7 +105,9 @@ def forward(self, x): # ------------------------------------------------------ ds = DataShaper() before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") @@ -122,7 +124,9 @@ def forward(self, x): stop = time.time_ns() if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) 
write_out_trace(trace, trace_file) npu_time = stop - start @@ -134,7 +138,9 @@ def forward(self, x): temp_out = aie_output.reshape(32, 8, 32, 8) temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") ofm_mem_fmt = temp_out.reshape(64, 32, 32) - ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) # ------------------------------------------------------ @@ -151,6 +157,7 @@ def forward(self, x): print("\nPASS!\n") + if __name__ == "__main__": p = test_utils.create_default_argparser() opts = p.parse_args(sys.argv[1:]) diff --git a/programming_examples/ml/resnet/layers_conv2_x/test.py b/programming_examples/ml/resnet/layers_conv2_x/test.py index 5784a4d30a..48b45b99ae 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/test.py +++ b/programming_examples/ml/resnet/layers_conv2_x/test.py @@ -15,9 +15,11 @@ import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils + torch.use_deterministic_algorithms(True) torch.manual_seed(0) + def main(opts): design = "resnet_conv2_x_int8" xclbin_path = opts.xclbin @@ -51,16 +53,28 @@ def main(opts): int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) - block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) - block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) + block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type( + torch.FloatTensor + ) - block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor) + block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type( + torch.FloatTensor + ) block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) - block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor) + block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type( + torch.FloatTensor + ) - block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor) + block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type( + torch.FloatTensor + ) block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) - block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor) + block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type( + torch.FloatTensor + ) init_scale = 0.5 block_0_relu_1 = 0.5 @@ -151,7 +165,6 @@ def main(opts): trace_size=trace_size, ) - # ------------------------------------------------------ # Define your golden reference # ------------------------------------------------------ @@ -167,7 +180,12 @@ def __init__(self, in_planes=64, planes=64): # Bottleneck 0 self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) self.block_0_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, ) self.block_0_conv3 = nn.Conv2d( planes, self.expansion * planes, kernel_size=1, bias=False @@ -182,7 +200,12 @@ def __init__(self, 
in_planes=64, planes=64): self.expansion * planes, planes, kernel_size=1, bias=False ) self.block_1_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, ) self.block_1_conv3 = nn.Conv2d( planes, self.expansion * planes, kernel_size=1, bias=False @@ -197,7 +220,12 @@ def __init__(self, in_planes=64, planes=64): self.expansion * planes, planes, kernel_size=1, bias=False ) self.block_2_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, ) self.block_2_conv3 = nn.Conv2d( planes, self.expansion * planes, kernel_size=1, bias=False @@ -209,7 +237,9 @@ def __init__(self, in_planes=64, planes=64): def forward(self, x): # **************** Bottleneck 0 **************** - block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + block_0_conv1_out = ( + self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + ) block_0_relu1_out = torch.clamp( torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), min, @@ -244,7 +274,9 @@ def forward(self, x): block_0_rhf_same_scale + block_0_lhs_same_scale ) block_0_final_out = torch.clamp( - torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max + torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), + min, + max, ) # **************** Bottleneck 1 **************** block_1_conv1_out = ( @@ -276,9 +308,13 @@ def forward(self, x): torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 ) - block_1_skip_add = block_0_relu_3 * (block_1_rhf_same_scale + block_0_final_out) + block_1_skip_add = block_0_relu_3 * ( + block_1_rhf_same_scale + block_0_final_out + ) block_1_final_out = torch.clamp( - torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max + torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), + min, + max, ) # **************** Bottleneck 2 **************** @@ -311,7 +347,9 @@ def forward(self, x): torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 ) - block_2_skip_add = block_1_relu_3 * (block_2_rhf_same_scale + block_1_final_out) + block_2_skip_add = block_1_relu_3 * ( + block_2_rhf_same_scale + block_1_final_out + ) block_2_final_out = block_2_relu_3 * ( torch.clamp( torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), @@ -321,7 +359,6 @@ def forward(self, x): ) return block_2_final_out - # ------------------------------------------------------ # Pytorch baseline # ------------------------------------------------------ @@ -347,7 +384,9 @@ def forward(self, x): # ------------------------------------------------------ ds = DataShaper() before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") @@ -407,7 +446,9 @@ def forward(self, x): stop = time.time_ns() if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) write_out_trace(trace, trace_file) npu_time = stop - start @@ -419,7 +460,9 @@ def forward(self, x): 
temp_out = aie_output.reshape(32, 32, 32, 8) temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") ofm_mem_fmt = temp_out.reshape(256, 32, 32) - ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) # ------------------------------------------------------ @@ -436,6 +479,7 @@ def forward(self, x): print("\nPASS!\n") + if __name__ == "__main__": p = test_utils.create_default_argparser() opts = p.parse_args(sys.argv[1:]) From 284b6114e9f10995430de7b1523aceb709f267f2 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 20:33:03 -0600 Subject: [PATCH 10/11] Fix run.lit --- programming_examples/ml/bottleneck/run.lit | 2 +- programming_examples/ml/conv2d/run.lit | 2 +- programming_examples/ml/conv2d_fused_relu/run.lit | 2 +- programming_examples/ml/resnet/layers_conv2_x/run.lit | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/programming_examples/ml/bottleneck/run.lit b/programming_examples/ml/bottleneck/run.lit index ec30002c97..8a6024d66e 100644 --- a/programming_examples/ml/bottleneck/run.lit +++ b/programming_examples/ml/bottleneck/run.lit @@ -8,5 +8,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit index 5220b6f5e4..349e45f9bc 100644 --- a/programming_examples/ml/conv2d/run.lit +++ b/programming_examples/ml/conv2d/run.lit @@ -6,5 +6,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_i8.cc -o conv2dk1_i8.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/run.lit b/programming_examples/ml/conv2d_fused_relu/run.lit index 0c122f451e..cfddde9013 100644 --- a/programming_examples/ml/conv2d_fused_relu/run.lit +++ b/programming_examples/ml/conv2d_fused_relu/run.lit @@ -6,5 +6,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DINT8_ACT -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! 
\ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/run.lit b/programming_examples/ml/resnet/layers_conv2_x/run.lit index 61f43e45e6..c35a868772 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/run.lit +++ b/programming_examples/ml/resnet/layers_conv2_x/run.lit @@ -10,5 +10,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file From 377082bff2a87fcb986a8ef564d5e03b0fd4304b Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 20:41:10 -0600 Subject: [PATCH 11/11] Fix resnet includes --- programming_examples/ml/resnet/layers_conv2_x/aie2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 235b5c5308..f5243070d9 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -7,8 +7,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * +from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith -from aie.dialects.scf import for_, yield_ from aie.extras.context import mlir_mod_ctx from aie.ir import MemRefType, TypeAttr
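
A note for readers following the golden-reference model in test.py: every bottleneck stage requantizes its convolution output with the same clamp(round(x / scale)) pattern. Below is a minimal, standalone sketch of that pattern; the helper name and the default output range are illustrative and not part of the repository.

import torch

def requantize(x, scale, qmin=0, qmax=255):
    # Mirrors the torch.clamp(torch.round(... / scale), min, max) pattern
    # applied after each convolution in the golden reference.
    return torch.clamp(torch.round(x / scale), qmin, qmax)

# Example: squash a fake conv output into the uint8 activation range.
y = requantize(torch.randn(1, 64, 32, 32) * 10.0, scale=0.5)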
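The performance number reported by test.py comes from wall-clock timestamps taken immediately around the accelerator call (npu_time = stop - start). A small sketch of that bookkeeping is below, with a placeholder callable standing in for the execute() helper used by the test.

import time

def timed_run(run_fn, *args, **kwargs):
    # Bracket the accelerator call with time.time_ns(), as test.py does;
    # run_fn is a placeholder for that call.
    start = time.time_ns()
    out = run_fn(*args, **kwargs)
    stop = time.time_ns()
    return out, stop - start  # result plus elapsed time in nanoseconds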
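The updated RUN lines now pass the xclbin, the instruction stream, and the kernel name to test.py explicitly (-x aie.xclbin -i insts.txt -k MLIR_AIE) instead of relying on defaults. The sketch below shows the kind of argument handling those flags imply; it is a hypothetical stand-in for test_utils.create_default_argparser(), and only the -x / -i / -k spellings and the --xclbin destination are taken from the patches themselves.

import argparse

def make_test_argparser():
    # Hypothetical equivalent of aie.utils.test.create_default_argparser();
    # option names other than the flag spellings are assumptions.
    p = argparse.ArgumentParser()
    p.add_argument("-x", "--xclbin", required=True, help="xclbin built by aiecc.py")
    p.add_argument("-i", "--instr", required=True, help="NPU instruction stream (insts.txt)")
    p.add_argument("-k", "--kernel", default="MLIR_AIE", help="kernel name inside the xclbin")
    return p

opts = make_test_argparser().parse_args(["-x", "aie.xclbin", "-i", "insts.txt", "-k", "MLIR_AIE"])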