From 496a79dd45f75f6434754aecf263baeffcc3ebb4 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 10:14:36 -0600 Subject: [PATCH 01/11] Add torch to lit cfg for programming_examples --- .github/workflows/buildAndTestRyzenAI.yml | 1 + programming_examples/lit.cfg.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml index acf2262fa2..bc3988e002 100644 --- a/.github/workflows/buildAndTestRyzenAI.yml +++ b/.github/workflows/buildAndTestRyzenAI.yml @@ -127,6 +127,7 @@ jobs: python -m venv aie-venv source aie-venv/bin/activate pip install -r python/requirements.txt + pip install -r python/requirements_ml.txt pip install jupyter sed -i.bak 's/OUTPUT_TIMEOUT = 10/OUTPUT_TIMEOUT = 100/g' \ $(python -c 'import site; print(site.getsitepackages()[0])')/jupyter_client/runapp.py diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index b774bc5280..61acb45937 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -165,6 +165,14 @@ opencv_flags = "" config.substitutions.append(("%opencv_flags", opencv_flags)) +try: + import torch + + config.available_features.add("torch") + except: + print("torch not found") + pass + VitisSysrootFlag = "" if "x86_64" in config.aieHostTarget: config.substitutions.append(("%aieHostTargetTriplet%", "x86_64-unknown-linux-gnu")) From afa454a1c1e65fb03115897dedecb2a9b3d5628a Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 10:25:24 -0600 Subject: [PATCH 02/11] Fixes --- programming_examples/lit.cfg.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 61acb45937..5e1871c57c 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -166,12 +166,12 @@ config.substitutions.append(("%opencv_flags", opencv_flags)) try: - import torch + import torch - config.available_features.add("torch") - except: - print("torch not found") - pass + config.available_features.add("torch") +except ImportError: + print("torch not found") + pass VitisSysrootFlag = "" if "x86_64" in config.aieHostTarget: From f49dcff88378073ff16c288119c130d2ebed468c Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 10:28:24 -0600 Subject: [PATCH 03/11] Update lit.cfg.py torch not found error --- programming_examples/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 5e1871c57c..ffb130bc42 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -170,7 +170,7 @@ config.available_features.add("torch") except ImportError: - print("torch not found") + print("torch not found", file=sys.stder) pass VitisSysrootFlag = "" From 3f4fd7db243e80a199f42daa6347b1767773c3e2 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 10:39:49 -0600 Subject: [PATCH 04/11] Fix typo --- programming_examples/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index ffb130bc42..a03d2c7338 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -170,7 +170,7 @@ config.available_features.add("torch") except ImportError: - print("torch not found", file=sys.stder) + print("torch not found", file=sys.stderr) pass VitisSysrootFlag = "" From 
a4157264de25f6742190573fe6fa9f79de457bee Mon Sep 17 00:00:00 2001 From: singagan <53442471+singagan@users.noreply.github.com> Date: Mon, 22 Apr 2024 16:17:31 +0200 Subject: [PATCH 05/11] Resnet with python binding (#1368) --- .../ml/resnet/layers_conv2_x/Makefile | 9 +- .../ml/resnet/layers_conv2_x/aie2.py | 1106 ++++++++++------- 2 files changed, 693 insertions(+), 422 deletions(-) diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile index 2f978a05ba..d8f1b7261a 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/Makefile +++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile @@ -12,13 +12,10 @@ mlirFileName = aie all: build/conv2dk1_i8.o build/conv2dk1_skip_init.o build/conv2dk3.o build/conv2dk1_skip.o build/conv2dk1_ui8.o build/final.xclbin -# build/${mlirFileName}.mlir: aie2.py -# mkdir -p ${@D} -# python3 $< > $@ - -build/${mlirFileName}.mlir: aie.mlir +build/${mlirFileName}.mlir: aie2.py mkdir -p ${@D} - cp $< $@ + python3 $< > $@ + insts.txt: build/${mlirFileName}.mlir aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 385a4fc7a5..235b5c5308 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -8,7 +8,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * from aie.extras.dialects.ext import memref, arith -from aie.extras.dialects.ext.scf import range_, yield_ +from aie.dialects.scf import for_, yield_ from aie.extras.context import mlir_mod_ctx from aie.ir import MemRefType, TypeAttr @@ -21,21 +21,16 @@ # Define bottleneck layer sizes -tensorInW = 32 -tensorInH = 32 -tensorInC = 256 -tensorL1InC = tensorInC -tensorL1OutC = tensorL1InC // 4 +def resnet_conv_x(): -tensorL2InC = tensorL1OutC -tensorL2OutC = tensorL2InC + tensorInW = 32 + tensorInH = 32 + tensorInCInit = 64 + tensorInCRest = 4 * tensorInCInit + n_cols = 3 + repeat = 2 -tensorL3InC = tensorL2OutC -tensorL3OutC = tensorL3InC * 4 - - -def bottleneck4AIEs(): with mlir_mod_ctx() as ctx: @device(AIEDevice.ipu) @@ -44,23 +39,36 @@ def deviceBody(): # define types uint8_ty = IntegerType.get_unsigned(8) int8_ty = IntegerType.get_signless(8) - int16_ty = IntegerType.get_signless(16) int32_ty = IntegerType.get_signless(32) - tensorLayer1In_ty = MemRefType.get( + tensorLayer1In_ty_init = MemRefType.get( ( tensorInW, 1, - tensorL1InC, + tensorInCInit, ), int8_ty, ) - weightsLayer1_ty = MemRefType.get((tensorL1InC * tensorL1OutC,), int8_ty) + tensorLayer1In_ty_rest = MemRefType.get( + ( + tensorInW, + 1, + tensorInCRest, + ), + uint8_ty, + ) + weightsLayer1_ty_init = MemRefType.get( + (tensorInCInit * tensorInCInit,), int8_ty + ) + weightsLayer1_ty_rest = MemRefType.get( + (tensorInCRest * tensorInCInit,), int8_ty + ) + tensorLayer1Out_ty = MemRefType.get( ( tensorInW, 1, - tensorL1OutC, + tensorInCInit, ), uint8_ty, ) @@ -69,18 +77,18 @@ def deviceBody(): ( tensorInW, 1, - tensorL2InC, + tensorInCInit, ), uint8_ty, ) weightsLayer2_ty = MemRefType.get( - (3 * 3 * tensorL2InC * tensorL2OutC,), int8_ty + (3 * 3 * tensorInCInit * tensorInCInit,), int8_ty ) tensorLayer2Out_ty = MemRefType.get( ( tensorInW, 1, - tensorL2OutC // 2, + tensorInCInit // 2, ), uint8_ty, ) @@ -89,35 +97,51 @@ def deviceBody(): ( tensorInW, 1, - tensorL3InC // 2, + tensorInCInit // 2, ), uint8_ty, ) - weightsLayer3_ty = MemRefType.get((tensorL3InC * 
tensorL3OutC,), int8_ty) + weightsLayer3_ty_init = MemRefType.get( + (2 * tensorInCInit * tensorInCRest,), int8_ty + ) + weightsLayer3_ty_rest = MemRefType.get( + (tensorInCRest // 4 * tensorInCRest,), int8_ty + ) + tensorLayer3Out_ty = MemRefType.get( ( tensorInW, 1, - tensorL3OutC, + tensorInCRest, ), uint8_ty, ) - allWeights_ty = MemRefType.get( + allWeights_ty_init = MemRefType.get( + ( + tensorInCInit * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest + + tensorInCInit * tensorInCRest, + ), + int8_ty, + ) + + allWeights_ty_rest = MemRefType.get( ( - tensorL1InC * tensorL1OutC - + 3 * 3 * tensorL2InC * tensorL2OutC - + tensorL3InC * tensorL3OutC, + tensorInCRest * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest, ), int8_ty, ) # kernel definitions - conv2dk1 = external_func( + conv2dk1_i8 = external_func( "conv2dk1_i8", inputs=[ - tensorLayer1In_ty, - weightsLayer1_ty, + tensorLayer1In_ty_init, + weightsLayer1_ty_init, tensorLayer1Out_ty, int32_ty, int32_ty, @@ -143,14 +167,42 @@ def deviceBody(): int32_ty, ], ) - conv2dk1_skip = external_func( - "conv2dk1_skip_i8", + conv2dk1_skip_init_i8 = external_func( + "conv2dk1_skip_init_i8", + inputs=[ + tensorLayer3In_ty, + tensorLayer3In_ty, + weightsLayer3_ty_init, + tensorLayer3Out_ty, + tensorLayer1In_ty_init, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + conv2dk1_ui8 = external_func( + "conv2dk1_ui8", + inputs=[ + tensorLayer3Out_ty, + weightsLayer1_ty_rest, + tensorLayer1Out_ty, + int32_ty, + int32_ty, + int32_ty, + int32_ty, + ], + ) + + conv2dk1_skip_ui8 = external_func( + "conv2dk1_skip_ui8", inputs=[ tensorLayer3In_ty, tensorLayer3In_ty, - weightsLayer3_ty, + weightsLayer3_ty_rest, + tensorLayer3Out_ty, tensorLayer3Out_ty, - tensorLayer1In_ty, int32_ty, int32_ty, int32_ty, @@ -159,461 +211,658 @@ def deviceBody(): ], ) - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) + ShimTile00 = tile(0, 0) + MemTile01 = tile(0, 1) + ComputeTile02 = tile(0, 2) + ComputeTile03 = tile(0, 3) + ComputeTile04 = tile(0, 4) + ComputeTile05 = tile(0, 5) + + ShimTile10 = tile(1, 0) + MemTile11 = tile(1, 1) + ComputeTile12 = tile(1, 2) + ComputeTile13 = tile(1, 3) + ComputeTile14 = tile(1, 4) + ComputeTile15 = tile(1, 5) + + ShimTile20 = tile(2, 0) + MemTile21 = tile(2, 1) + ComputeTile22 = tile(2, 2) + ComputeTile23 = tile(2, 3) + ComputeTile24 = tile(2, 4) + ComputeTile25 = tile(2, 5) + + shims = [ShimTile00, ShimTile10, ShimTile20] + mems = [MemTile01, MemTile11, MemTile21] + wts_sizes = [allWeights_ty_init, allWeights_ty_rest, allWeights_ty_rest] + layer1_wts_sizes = [ + weightsLayer1_ty_init, + weightsLayer1_ty_rest, + weightsLayer1_ty_rest, + ] + laye1_act_sizes = [ + tensorLayer1In_ty_init, + tensorLayer1In_ty_rest, + tensorLayer1In_ty_rest, + ] + layer3_wts_sizes = [ + weightsLayer3_ty_init, + weightsLayer3_ty_rest, + weightsLayer3_ty_rest, + ] + + cores = [ + [ComputeTile02, ComputeTile03, ComputeTile04, ComputeTile05], + [ComputeTile15, ComputeTile14, ComputeTile13, ComputeTile12], + [ComputeTile22, ComputeTile23, ComputeTile24, ComputeTile25], + ] if enableTrace: - flow(ComputeTile4, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) + flow(ComputeTile04, WireBundle.Trace, 0, ShimTile00, WireBundle.DMA, 1) # runtime parameters - rtpComputeTile2 = Buffer(ComputeTile2, [16], T.i32(), "rtpComputeTile2") - rtpComputeTile3 = 
Buffer(ComputeTile3, [16], T.i32(), "rtpComputeTile3") - rtpComputeTile4 = Buffer(ComputeTile4, [16], T.i32(), "rtpComputeTile4") - rtpComputeTile5 = Buffer(ComputeTile5, [16], T.i32(), "rtpComputeTile5") - + rtpComputeTile02 = Buffer(ComputeTile02, [16], T.i32(), "rtpComputeTile02") + rtpComputeTile03 = Buffer(ComputeTile03, [16], T.i32(), "rtpComputeTile03") + rtpComputeTile04 = Buffer(ComputeTile04, [16], T.i32(), "rtpComputeTile04") + rtpComputeTile05 = Buffer(ComputeTile05, [16], T.i32(), "rtpComputeTile05") + + rtpComputeTile12 = Buffer(ComputeTile12, [16], T.i32(), "rtpComputeTile12") + rtpComputeTile13 = Buffer(ComputeTile13, [16], T.i32(), "rtpComputeTile13") + rtpComputeTile14 = Buffer(ComputeTile14, [16], T.i32(), "rtpComputeTile14") + rtpComputeTile15 = Buffer(ComputeTile15, [16], T.i32(), "rtpComputeTile15") + + rtpComputeTile22 = Buffer(ComputeTile22, [16], T.i32(), "rtpComputeTile22") + rtpComputeTile23 = Buffer(ComputeTile23, [16], T.i32(), "rtpComputeTile23") + rtpComputeTile24 = Buffer(ComputeTile24, [16], T.i32(), "rtpComputeTile24") + rtpComputeTile25 = Buffer(ComputeTile25, [16], T.i32(), "rtpComputeTile25") + + rtp = [ + [ + rtpComputeTile02, + rtpComputeTile03, + rtpComputeTile04, + rtpComputeTile05, + ], + [ + rtpComputeTile15, + rtpComputeTile14, + rtpComputeTile13, + rtpComputeTile12, + ], + [ + rtpComputeTile22, + rtpComputeTile23, + rtpComputeTile24, + rtpComputeTile25, + ], + ] + rtp_name = [ + [ + "rtpComputeTile02", + "rtpComputeTile03", + "rtpComputeTile04", + "rtpComputeTile05", + ], + [ + "rtpComputeTile12", + "rtpComputeTile13", + "rtpComputeTile14", + "rtpComputeTile15", + ], + [ + "rtpComputeTile22", + "rtpComputeTile23", + "rtpComputeTile24", + "rtpComputeTile25", + ], + ] # set up data movement with OFs + conv1_kernels = ["conv2dk1_i8.o", "conv2dk1_ui8.o", "conv2dk1_ui8.o"] + conv1_kernels_call = [conv2dk1_i8, conv2dk1_ui8, conv2dk1_ui8] + + conv3_kernels = [ + "conv2dk1_skip_init.o", + "conv2dk1_skip.o", + "conv2dk1_skip.o", + ] + conv3_kernels_call = [ + conv2dk1_skip_init_i8, + conv2dk1_skip_ui8, + conv2dk1_skip_ui8, + ] + + act1_fifo_names = ["act1_00_02_01", "act1_04_15_01", "act1_13_22_21"] + act1_fifos = {} + + wts_fifo_names = ["wts_0_L3L2", "wts_1_L3L2", "wts_2_L3L2"] + wts_fifos = {} + wts_sub_fifo_names = [ + ["wts_buf_00", "wts_buf_01", "wts_buf_02"], + ["wts_buf_10", "wts_buf_11", "wts_buf_12"], + ["wts_buf_20", "wts_buf_21", "wts_buf_22"], + ] + wts_sub_fifos = {} + + for i in range(n_cols): + wts_fifos[wts_fifo_names[i]] = object_fifo( + wts_fifo_names[i], shims[i], mems[i], 1, wts_sizes[i] + ) + wts_sub_fifos[wts_sub_fifo_names[i][0]] = object_fifo( + wts_sub_fifo_names[i][0], + mems[i], + cores[i][0], + 1, + layer1_wts_sizes[i], + ) + wts_sub_fifos[wts_sub_fifo_names[i][1]] = object_fifo( + wts_sub_fifo_names[i][1], + mems[i], + [cores[i][1], cores[i][3]], + 1, + weightsLayer2_ty, + ) + wts_sub_fifos[wts_sub_fifo_names[i][2]] = object_fifo( + wts_sub_fifo_names[i][2], + mems[i], + cores[i][2], + 1, + layer3_wts_sizes[i], + ) + object_fifo_link( + wts_fifo_names[i], + [ + wts_sub_fifo_names[i][0], + wts_sub_fifo_names[i][1], + wts_sub_fifo_names[i][2], + ], + ) + # input tensor (with broadcast for skip connection) - of_inOF_act_L3L2 = object_fifo( - "inOF_act_L3L2", - ShimTile, - [ComputeTile2, MemTile], + act1_fifo_names = ["act1_00_02_01", "act1_04_15_11", "act1_13_22_21"] + act1_fifos = {} + + skip_fifo_names = ["skip_0", "skip_1", "skip_2"] + skip_fifos = {} + + act1_fifos[act1_fifo_names[0]] = object_fifo( + act1_fifo_names[0], + 
shims[0], + [cores[0][0], mems[0]], [2, 2, 4], - tensorLayer1In_ty, + laye1_act_sizes[0], ) - of_skip_buf = object_fifo( - "skip_buf", MemTile, ComputeTile4, 2, tensorLayer1In_ty + skip_fifos[skip_fifo_names[0]] = object_fifo( + skip_fifo_names[0], mems[0], cores[0][2], 2, laye1_act_sizes[0] ) - object_fifo_link(of_inOF_act_L3L2, of_skip_buf) + object_fifo_link(act1_fifo_names[0], skip_fifo_names[0]) + + for i in range(1, repeat + 1): + act1_fifos[act1_fifo_names[i]] = object_fifo( + act1_fifo_names[i], + cores[i - 1][2], + [cores[i][0], mems[i - 1]], + [2, 2, 4], + laye1_act_sizes[i], + ) + skip_fifos[skip_fifo_names[i]] = object_fifo( + skip_fifo_names[i], + mems[i - 1], + cores[i][2], + 2, + laye1_act_sizes[i], + ) + object_fifo_link(act1_fifo_names[i], skip_fifo_names[i]) - # weights - inOF_wts_0_L3L2 = object_fifo( - "inOF_wts_0_L3L2", ShimTile, MemTile, 1, allWeights_ty - ) - of_wts_buf_00 = object_fifo( - "wts_buf_00", MemTile, ComputeTile2, 1, weightsLayer1_ty - ) - wts_buf_01 = object_fifo( - "wts_buf_01", - MemTile, - [ComputeTile3, ComputeTile5], - 1, - weightsLayer2_ty, - ) - wts_buf_02 = object_fifo( - "wts_buf_02", MemTile, ComputeTile4, 1, weightsLayer3_ty - ) - object_fifo_link(inOF_wts_0_L3L2, [of_wts_buf_00, wts_buf_01, wts_buf_02]) - - # activation tensor - of_act_2_3_5 = object_fifo( - "act_2_3_5", - ComputeTile2, - [ComputeTile3, ComputeTile5], - [2, 4, 4], - tensorLayer1Out_ty, - ) # 1x1 -> 3x3 - act_3_4 = object_fifo( - "act_3_4", ComputeTile3, ComputeTile4, 2, tensorLayer2Out_ty - ) # 3x3 -> 1x1 - act_5_4 = object_fifo( - "act_5_4", ComputeTile5, ComputeTile4, 2, tensorLayer2Out_ty - ) # 3x3 -> 1x1 + act2_fifo_names = ["act2_02_03_05", "act2_15_12_14", "act2_22_23_25"] + act2_fifos = {} + + act3_fifo_names_1 = ["act3_03_04", "act3_14_13", "act3_23_24"] + act3_fifo_1 = {} + + act3_fifo_names_2 = ["act3_05_04", "act3_12_13", "act3_25_24"] + act3_fifo_2 = {} + + for i in range(n_cols): + # 1x1 -> 3x3 + act2_fifos[act2_fifo_names[i]] = object_fifo( + act2_fifo_names[i], + cores[i][0], + [cores[i][1], cores[i][3]], + 2, + tensorLayer1Out_ty, + ) + + # 3x3 -> 1x1 + act3_fifo_1[act3_fifo_names_1[i]] = object_fifo( + act3_fifo_names_1[i], + cores[i][1], + cores[i][2], + 2, + tensorLayer2Out_ty, + ) + # 3x3 -> 1x1 + act3_fifo_2[act3_fifo_names_2[i]] = object_fifo( + act3_fifo_names_2[i], + cores[i][3], + cores[i][2], + 2, + tensorLayer2Out_ty, + ) # output tensor outOFL2L3 = object_fifo( - "outOFL2L3", ComputeTile4, ShimTile, 2, tensorLayer3Out_ty + "outOFL2L3", cores[2][2], shims[2], 2, tensorLayer3Out_ty ) - - # 1x1 conv2d - @core(ComputeTile2, "conv2dk1.o") - def core_body(): - for _ in range_(sys.maxsize): - - # acquire weights once - element0Weights = of_wts_buf_00.acquire(ObjectFifoPort.Consume, 1) - scale = memref.load(rtpComputeTile2, [0]) - for _ in range_(tensorInH): - element0ActivactionsIn = of_inOF_act_L3L2.acquire( - ObjectFifoPort.Consume, 1 + conv3_out_fifo = [ + act1_fifos[act1_fifo_names[1]], + act1_fifos[act1_fifo_names[2]], + outOFL2L3, + ] + conv3_out_fifo_names = ["act1_04_15_11", "act1_13_22_21", "outOFL2L3"] + # # 1x1 conv2d + for i in range(n_cols): + + @core(cores[i][0], conv1_kernels[i]) + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][0] + ].acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtp[i][0], [0]) + for _ in for_(tensorInH): + element0ActivactionsIn = act1_fifos[ + act1_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 1) + element0ActivactionsOut 
= act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv1_kernels_call[i], + [ + element0ActivactionsIn, + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit, + scale, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act1_fifo_names[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Produce, act2_fifo_names[i], 1 + ) + yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][0], 1 ) - element0ActivactionsOut = of_act_2_3_5.acquire( - ObjectFifoPort.Produce, 1 + yield_([]) + + # 3x3 conv2d OFM 0-31 + for i in range(n_cols): + + @core(cores[i][1], "conv2dk3.o") + def core_body(): + scale = 11 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][1] + ].acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile03, 0) + + # pre-amble: top row + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 ) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) res = call( - conv2dk1, + conv2dk3, [ - element0ActivactionsIn, + elementActivactionsIn[0], + elementActivactionsIn[0], + elementActivactionsIn[1], element0Weights, element0ActivactionsOut, tensorInW, - tensorL1InC, - tensorL1OutC, + tensorInCInit, + tensorInCInit, + 3, + 3, + 0, scale, + 0, ], ) - - objectfifo_release(ObjectFifoPort.Consume, "inOF_act_L3L2", 1) - - objectfifo_release(ObjectFifoPort.Produce, "act_2_3_5", 1) - yield_([]) - objectfifo_release(ObjectFifoPort.Consume, "wts_buf_00", 1) - yield_([]) - - # 3x3 conv2d OFM 0-31 - @core(ComputeTile3, "conv2dk3.o") - def core_body(): - scale = 11 - for _ in range_(sys.maxsize): - - # acquire weights and rtps once - element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) - # scale = memref.load(rtpComputeTile3, 0) - - # pre-amble: top row - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 2 - ) - element0ActivactionsOut = act_3_4.acquire(ObjectFifoPort.Produce, 1) - res = call( - conv2dk3, - [ - elementActivactionsIn[0], - elementActivactionsIn[0], - elementActivactionsIn[1], - element0Weights, - element0ActivactionsOut, - tensorInW, - tensorL2InC, - tensorL2OutC, - 3, - 3, - 0, - scale, - 0, - ], - ) - objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) - - # middle - for _ in range_(tensorInH - 2): - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 3 + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 ) - element0ActivactionsOut = act_3_4.acquire( - ObjectFifoPort.Produce, 1 + + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 3) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit, + 3, + 3, + 1, + scale, + 0, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) + yield_([]) + + # last part + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 ) + element0ActivactionsOut = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Produce, 1) res 
= call( conv2dk3, [ elementActivactionsIn[0], elementActivactionsIn[1], - elementActivactionsIn[2], + elementActivactionsIn[1], element0Weights, element0ActivactionsOut, tensorInW, - tensorL2InC, - tensorL2OutC, + tensorInCInit, + tensorInCInit, 3, 3, - 1, + 2, scale, 0, ], ) - objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) - objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) - yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 2 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 + ) - # last part - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 2 - ) - element0ActivactionsOut = act_3_4.acquire(ObjectFifoPort.Produce, 1) - res = call( - conv2dk3, - [ - elementActivactionsIn[0], - elementActivactionsIn[1], - elementActivactionsIn[1], - element0Weights, - element0ActivactionsOut, - tensorInW, - tensorL2InC, - tensorL2OutC, - 3, - 3, - 2, - scale, - 0, - ], - ) - - objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) - objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) - - objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) - yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 + ) + yield_([]) # 3x3 conv2d OFM 32-63 - @core(ComputeTile5, "conv2dk3.o") - def core_body(): - scale = 11 - for _ in range_(sys.maxsize): - - # acquire weights and rtps once - element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) - # scale = memref.load(rtpComputeTile5, 0) - - # pre-amble: top row - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 2 - ) - element0ActivactionsOut = act_5_4.acquire(ObjectFifoPort.Produce, 1) - res = call( - conv2dk3, - [ - elementActivactionsIn[0], - elementActivactionsIn[0], - elementActivactionsIn[1], - element0Weights, - element0ActivactionsOut, - tensorInW, - tensorL2InC, - tensorL2OutC, - 3, - 3, - 0, - scale, - tensorL2OutC // 2, - ], - ) - - objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) - - # middle - for _ in range_(tensorInH - 2): - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 3 - ) - element0ActivactionsOut = act_5_4.acquire( - ObjectFifoPort.Produce, 1 + + for i in range(n_cols): + + @core(cores[i][3], "conv2dk3.o") + def core_body(): + scale = 11 + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][1] + ].acquire(ObjectFifoPort.Consume, 1) + # scale = memref.load(rtpComputeTile05, 0) + + # pre-amble: top row + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 ) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) res = call( conv2dk3, [ + elementActivactionsIn[0], elementActivactionsIn[0], elementActivactionsIn[1], - elementActivactionsIn[2], element0Weights, element0ActivactionsOut, tensorInW, - tensorL2InC, - tensorL2OutC, + tensorInCInit, + tensorInCInit, 3, 3, - 1, + 0, scale, - tensorL2OutC // 2, + tensorInCInit // 2, ], ) - objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) - objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) - yield_([]) - - # last part - elementActivactionsIn = of_act_2_3_5.acquire( - ObjectFifoPort.Consume, 2 - ) - element0ActivactionsOut = act_5_4.acquire(ObjectFifoPort.Produce, 1) - res = call( - conv2dk3, - [ - elementActivactionsIn[0], - elementActivactionsIn[1], - elementActivactionsIn[1], - element0Weights, - element0ActivactionsOut, - tensorInW, - 
tensorL2InC, - tensorL2OutC, - 3, - 3, - 2, - scale, - tensorL2OutC // 2, - ], - ) - objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) - objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) - objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) - yield_([]) - - # # 1x1 conv2d and add skip - @core(ComputeTile4, "conv2dk1_skip.o") - def core_body(): - for _ in range_(sys.maxsize): - - # acquire weights and rtps once - element0Weights = wts_buf_02.acquire(ObjectFifoPort.Consume, 1) - scale = memref.load(rtpComputeTile4, [0]) - skipScale = memref.load(rtpComputeTile4, [1]) - - for _ in range_(tensorInH): - element0ActivactionsIn = act_3_4.acquire( - ObjectFifoPort.Consume, 1 - ) - element1ActivactionsIn = act_5_4.acquire( - ObjectFifoPort.Consume, 1 - ) - elementSkipsIn = of_skip_buf.acquire(ObjectFifoPort.Consume, 1) - elementActivactionsOut = outOFL2L3.acquire( - ObjectFifoPort.Produce, 1 + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 ) - call( - conv2dk1_skip, + # middle + for _ in for_(tensorInH - 2): + elementActivactionsIn = act2_fifos[ + act2_fifo_names[i] + ].acquire(ObjectFifoPort.Consume, 3) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, + [ + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[2], + element0Weights, + element0ActivactionsOut, + tensorInW, + tensorInCInit, + tensorInCInit, + 3, + 3, + 1, + scale, + tensorInCInit // 2, + ], + ) + + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + yield_([]) + + # last part + elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 2 + ) + element0ActivactionsOut = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Produce, 1) + res = call( + conv2dk3, [ - element0ActivactionsIn, - element1ActivactionsIn, + elementActivactionsIn[0], + elementActivactionsIn[1], + elementActivactionsIn[1], element0Weights, - elementActivactionsOut, - elementSkipsIn, + element0ActivactionsOut, tensorInW, - tensorL3InC, - tensorL3OutC, + tensorInCInit, + tensorInCInit, + 3, + 3, + 2, scale, - skipScale, + tensorInCInit // 2, ], ) - objectfifo_release(ObjectFifoPort.Produce, "outOFL2L3", 1) - objectfifo_release(ObjectFifoPort.Consume, "act_3_4", 1) - objectfifo_release(ObjectFifoPort.Consume, "act_5_4", 1) - objectfifo_release(ObjectFifoPort.Consume, "skip_buf", 1) + objectfifo_release( + ObjectFifoPort.Consume, act2_fifo_names[i], 2 + ) + objectfifo_release( + ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 + ) + yield_([]) + + # # 1x1 conv2d and add skip + for i in range(n_cols): + + @core(cores[i][2], conv3_kernels[i]) + def core_body(): + for _ in for_(sys.maxsize): + + # acquire weights and rtps once + element0Weights = wts_sub_fifos[ + wts_sub_fifo_names[i][2] + ].acquire(ObjectFifoPort.Consume, 1) + scale = memref.load(rtp[i][2], [0]) + skipScale = memref.load(rtp[i][2], [1]) + + for _ in for_(tensorInH): + element0ActivactionsIn = act3_fifo_1[ + act3_fifo_names_1[i] + ].acquire(ObjectFifoPort.Consume, 1) + element1ActivactionsIn = act3_fifo_2[ + act3_fifo_names_2[i] + ].acquire(ObjectFifoPort.Consume, 1) + + elementActivactionsOut = conv3_out_fifo[i].acquire( + ObjectFifoPort.Produce, 1 + ) + elementSkipsIn = skip_fifos[skip_fifo_names[i]].acquire( + ObjectFifoPort.Consume, 1 + 
) + call( + conv3_kernels_call[i], + [ + element0ActivactionsIn, + element1ActivactionsIn, + element0Weights, + elementActivactionsOut, + elementSkipsIn, + tensorInW, + tensorInCInit, + tensorInCRest, + scale, + skipScale, + ], + ) + objectfifo_release( + ObjectFifoPort.Consume, act3_fifo_names_1[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Consume, act3_fifo_names_2[i], 1 + ) + objectfifo_release( + ObjectFifoPort.Produce, conv3_out_fifo_names[i], 1 + ) + + objectfifo_release( + ObjectFifoPort.Consume, skip_fifo_names[i], 1 + ) + yield_([]) + objectfifo_release( + ObjectFifoPort.Consume, wts_sub_fifo_names[i][2], 1 + ) yield_([]) - objectfifo_release(ObjectFifoPort.Consume, "wts_buf_02", 1) - yield_([]) # instruction stream generation - activationsInSize32b = (tensorInW * tensorInH * tensorInC) // 4 - acitivationsOutSize32b = activationsInSize32b - totalWeightsSize32b = ( - tensorL1InC * tensorL1OutC - + 3 * 3 * tensorL2InC * tensorL2OutC - + tensorL3InC * tensorL3OutC + activationsInSize32b = (tensorInW * tensorInH * tensorInCInit) // 4 + acitivationsOutSize32b = (tensorInW * tensorInH * tensorInCRest) // 4 + + totalWeightsSize32b_init = ( + tensorInCInit * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + 2 * tensorInCInit * tensorInCRest + ) // 4 + + totalWeightsSize32b_rest = ( + tensorInCInit * tensorInCRest + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest ) // 4 + totalWeightsSize32b_complete = ( + totalWeightsSize32b_init + repeat * totalWeightsSize32b_rest + ) + activationsInL3_ty = MemRefType.get((activationsInSize32b,), int32_ty) - weightsInL3_ty = MemRefType.get((totalWeightsSize32b,), int32_ty) + activationsOutL3_ty = MemRefType.get((acitivationsOutSize32b,), int32_ty) + weightsInL3_ty_init = MemRefType.get((totalWeightsSize32b_init,), int32_ty) + weightsInL3_ty_rest = MemRefType.get((totalWeightsSize32b_rest,), int32_ty) - @FuncOp.from_py_func(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty) + weightsInL3_ty_complete = MemRefType.get( + (totalWeightsSize32b_complete,), int32_ty + ) + + @FuncOp.from_py_func( + activationsInL3_ty, weightsInL3_ty_complete, activationsOutL3_ty + ) def sequence(inputFromL3, weightsFromL3, outputToL3): - if enableTrace: - # Trace output - - # Trace_Event0, Trace_Event1: Select which events to trace. - # Note that the event buffers only appear to be transferred to DDR in - # bursts of 256 bytes. If less than 256 bytes are written, you may not - # see trace output, or only see it on the next iteration of your - # kernel invocation, as the buffer gets filled up. Note that, even - # though events are encoded as 4 byte words, it may take more than 64 - # events to fill the buffer to 256 bytes and cause a flush, since - # multiple repeating events can be 'compressed' by the trace mechanism. - # In order to always generate sufficient events, we add the "assert - # TRUE" event to one slot, which fires every cycle, and thus fills our - # buffer quickly. 
- - # Some events: - # TRUE (0x01) - # STREAM_STALL (0x18) - # LOCK_STALL (0x1A) - # EVENTS_CORE_INSTR_EVENT_1 (0x22) - # EVENTS_CORE_INSTR_EVENT_0 (0x21) - # INSTR_VECTOR (0x25) Core executes a vecotr MAC, ADD or compare instruction - # INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock .acquire instruction - # INSTR_LOCK_.release_REQ (0x2D) Core executes a lock .release instruction - # EVENTS_CORE_PORT_RUNNING_1 (0x4F) - # EVENTS_CORE_PORT_RUNNING_0 (0x4B) - - # Trace_Event0 (4 slots) - ipu_write32(0, 4, 0x340E0, 0x4B222125) - # Trace_Event1 (4 slots) - ipu_write32(0, 4, 0x340E4, 0x2D2C1A4F) - - # Event slots as configured above: - # 0: Kernel executes vector instruction - # 1: Event 0 -- Kernel starts - # 2: Event 1 -- Kernel done - # 3: Port_Running_0 - # 4: Port_Running_1 - # 5: Lock Stall - # 6: Lock .acquire Instr - # 7: Lock .release Instr - - # Stream_Switch_Event_Port_Selection_0 - # This is necessary to capture the Port_Running_0 and Port_Running_1 events - ipu_write32(0, 4, 0x3FF00, 0x121) - - # Trace_Control0: Define trace start and stop triggers. Set start event TRUE. - ipu_write32(0, 4, 0x340D0, 0x10000) - - # Start trace copy out. - ipu_writebd_shimtile( - bd_id=3, - buffer_length=trace_sz_in_i32s, - buffer_offset=acitivationsOutSize32b, - enable_packet=0, - out_of_order_id=0, - packet_id=0, - packet_type=0, - column=0, - column_num=1, - d0_stepsize=0, - d0_wrap=0, - d1_stepsize=0, - d1_wrap=0, - d2_stepsize=0, - ddr_id=2, - iteration_current=0, - iteration_stepsize=0, - iteration_wrap=0, - lock_acq_enable=0, - lock_acq_id=0, - lock_acq_val=0, - lock_rel_id=0, - lock_rel_val=0, - next_bd=0, - use_next_bd=0, - valid_bd=1, - ) - ipu_write32(0, 2, 0x1D20C, 0x3) - - # write RTP parameters - IpuWriteRTPOp( - "rtpComputeTile2", col=0, row=2, index=0, value=1 - ) # scale - IpuWriteRTPOp( - "rtpComputeTile3", col=0, row=3, index=0, value=1 - ) # scale - IpuWriteRTPOp( - "rtpComputeTile5", col=0, row=5, index=0, value=1 - ) # scale - IpuWriteRTPOp( - "rtpComputeTile4", col=0, row=4, index=0, value=1 - ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input - IpuWriteRTPOp( - "rtpComputeTile4", col=0, row=4, index=1, value=0 - ) # skip_scale + for c, col in enumerate(rtp_name): + for r, row in enumerate(col): + IpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1) # scale + + IpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=0) + IpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=1) + + IpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=0) + + IpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=0) + + # # # write RTP parameters + # IpuWriteRTPOp( + # "rtpComputeTile02", col=0, row=2, index=0, value=1 + # ) # scale + # IpuWriteRTPOp( + # "rtpComputeTile03", col=0, row=3, index=0, value=1 + # ) # scale + # IpuWriteRTPOp( + # "rtpComputeTile05", col=0, row=5, index=0, value=1 + # ) # scale + # IpuWriteRTPOp( + # "rtpComputeTile04", col=0, row=4, index=0, value=1 + # ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input + # IpuWriteRTPOp( + # "rtpComputeTile04", col=0, row=4, index=1, value=0 + # ) # skip_scale ipu_dma_memcpy_nd( - metadata="inOF_act_L3L2", + metadata="act1_00_02_01", bd_id=0, mem=inputFromL3, sizes=[1, 1, 1, activationsInSize32b], @@ -625,15 +874,40 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): sizes=[1, 1, 1, acitivationsOutSize32b], ) ipu_dma_memcpy_nd( - 
metadata="inOF_wts_0_L3L2", + metadata="wts_0_L3L2", + bd_id=1, + mem=weightsFromL3, + sizes=[1, 1, 1, totalWeightsSize32b_init], + ) + + ipu_dma_memcpy_nd( + metadata="wts_1_L3L2", + bd_id=1, + mem=weightsFromL3, + offsets=[0, 0, 0, totalWeightsSize32b_init], + sizes=[1, 1, 1, totalWeightsSize32b_rest], + ) + + ipu_dma_memcpy_nd( + metadata="wts_2_L3L2", bd_id=1, mem=weightsFromL3, - sizes=[1, 1, 1, totalWeightsSize32b], + offsets=[ + 0, + 0, + 0, + totalWeightsSize32b_init + totalWeightsSize32b_rest, + ], + sizes=[1, 1, 1, totalWeightsSize32b_rest], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + ipu_sync(column=1, row=0, direction=0, channel=0) - print(ctx.module) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) -bottleneck4AIEs() +resnet_conv_x() From 384650c5f036a0e7c7f3499d2140462a76aa0b8a Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 22 Apr 2024 10:15:03 -0600 Subject: [PATCH 06/11] readme update --- programming_examples/ml/bottleneck/README.md | 7 ---- programming_examples/ml/conv2d/README.md | 9 +---- .../ml/conv2d_fused_relu/README.md | 9 +---- programming_examples/ml/resnet/README.md | 8 ---- programming_guide/section-6/README.md | 39 +++++++++++++++++++ 5 files changed, 41 insertions(+), 31 deletions(-) create mode 100644 programming_guide/section-6/README.md diff --git a/programming_examples/ml/bottleneck/README.md b/programming_examples/ml/bottleneck/README.md index 144b8e36f2..40a69e8576 100644 --- a/programming_examples/ml/bottleneck/README.md +++ b/programming_examples/ml/bottleneck/README.md @@ -115,11 +115,4 @@ make To run the design: ``` make run_py -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - ``` \ No newline at end of file diff --git a/programming_examples/ml/conv2d/README.md b/programming_examples/ml/conv2d/README.md index 81b25f3e52..b2d93f066d 100644 --- a/programming_examples/ml/conv2d/README.md +++ b/programming_examples/ml/conv2d/README.md @@ -56,12 +56,5 @@ make To run the design: ``` -make run -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - +make run_py ``` \ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/README.md b/programming_examples/ml/conv2d_fused_relu/README.md index 68e7e9b8cf..3f4a2264cd 100644 --- a/programming_examples/ml/conv2d_fused_relu/README.md +++ b/programming_examples/ml/conv2d_fused_relu/README.md @@ -88,12 +88,5 @@ make To run the design: ``` -make run -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - +make run_py ``` \ No newline at end of file diff --git a/programming_examples/ml/resnet/README.md b/programming_examples/ml/resnet/README.md index 6382079c62..de4cc92535 100755 --- a/programming_examples/ml/resnet/README.md +++ b/programming_examples/ml/resnet/README.md @@ -107,14 +107,6 @@ To run the design: make run_py ``` -### Prerequisites - -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - -``` - ## References [1] He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770-778). 
diff --git a/programming_guide/section-6/README.md b/programming_guide/section-6/README.md new file mode 100644 index 0000000000..f54c812ab3 --- /dev/null +++ b/programming_guide/section-6/README.md @@ -0,0 +1,39 @@ + + +# Section 6 - Larger Example Designs + +There are a number of example designs available [here](../../programming_examples/) which further help explain many of the unique features of AI Engines and the NPU array in Ryzen™ AI. This section contains more complex application designs for both vision and machine learning use cases. In particular we will describe a ResNet implementation on for Ryzen™ AI. + +## Vision Kernels + +| Design name | Data type | Description | +|-|-|-| +| [Vision Passthrough](../../programming_examples/vision/vision_passthrough/) | i8 | A simple pipeline with just one `passThrough` kernel. This pipeline's main purpose is to test whether the data movement works correctly to copy a greyscale image. | +| [Color Detect](../../programming_examples/vision/color_detect/) | i32 | This multi-kernel, multi-core pipeline detects colors in an RGBA image. | +| [Edge Detect](../../programming_examples/vision/edge_detect/) | i32 | A mult-kernel, multi-core pipeline that detects edges in an image and overlays the detection on the original image. | +| [Color Threshold](../../programming_examples/vision/color_threshold/) | i32 | A mult-core data-parallel implementation of color thresholding of a RGBA image. | + + +## Machine Learning Designs + +| Design name | Data type | Description | +|-|-|-| +|[bottleneck](../../programming_examples/ml/bottleneck/)|ui8|A Bottleneck Residual Block is a variant of the residual block that utilises three convolutions, using 1x1, 3x3 and 1x1 filter sizes, respectively. The use of a bottleneck reduces the number of parameters and computations.| +|[resnet](../../programming_examples/ml/resnet/)|ui8|ResNet with offloaded conv2_x bottleneck blocks. The implementation features kernel fusion and dataflow optimizations highlighting the unique architectural capabilties of AI Engines.| + +## Exercises + +1. In [bottlneck](../../programming_examples/ml/bottleneck/) design following a dataflow approach, how many elements does the 3x3 convolution operation require to proceed with its computation? +2. Suppose you have a bottleneck block with input dimensions of 32x32x256. After passing through the 1x1 convolutional layer, the output dimensions become 32x32x64. What would be the output dimensions after the subsequent 3x3 convolutional layer, assuming a stride of 1 and no padding and output channel of 64? 
+ +----- +[[Prev - Section 5](../section-5/)] [[Top](..)] + From 16b1be502cab1ad60bde59a5af8367ca2ee7b2b9 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 22 Apr 2024 10:45:55 -0600 Subject: [PATCH 07/11] conv2d runtime fix --- programming_examples/ml/conv2d/Makefile | 2 +- programming_examples/ml/conv2d/run.lit | 2 +- programming_examples/ml/conv2d/test.py | 267 ++++++++++++------------ 3 files changed, 139 insertions(+), 132 deletions(-) diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile index 0274f3fef7..0f4c925ed3 100755 --- a/programming_examples/ml/conv2d/Makefile +++ b/programming_examples/ml/conv2d/Makefile @@ -32,4 +32,4 @@ clean: chess* *.o insts.txt \ *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py \ No newline at end of file + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit index 1eeef90b94..5220b6f5e4 100644 --- a/programming_examples/ml/conv2d/run.lit +++ b/programming_examples/ml/conv2d/run.lit @@ -1,4 +1,4 @@ -// (c) Copyright 2023 Advanced Micro Devices, Inc. +// (c) Copyright 2024 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai, chess, torch diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py index 1dc847d8fe..9d8d08e763 100644 --- a/programming_examples/ml/conv2d/test.py +++ b/programming_examples/ml/conv2d/test.py @@ -14,136 +14,143 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute - +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "conv2d" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("int8") - -shape_total_wts = (4096, 1) -shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' -shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_out = (32, 8, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor) -int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor) -conv_scale = 7.6294e-06 # scale to convert int8 output to floating point -int8_scale = 0.0078 # scale to convert int8 output to floating point -min = -128 -max = 127 -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# 
------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class conv2d_int_model(nn.Module): - def __init__(self, in_planes=64, planes=64): - super(conv2d_int_model, self).__init__() - self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) - - def forward(self, x): - out_int = self.conv(x) - out_quant = out_int * conv_scale # int8 x int8 leads to int32 output - out_float = int8_scale * torch.clamp( - torch.round(out_quant / int8_scale), min, max - ) # converting to int8 range - return out_float - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = conv2d_int_model() -model.eval() -model.conv.weight.data.copy_(int_weight) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") -total_wts = np.concatenate((wts1), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 8, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(64, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ - -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=2 * int8_scale, -) -print("\nPASS!\n") +def main(opts): + design = "conv2d" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("int8") + + shape_total_wts = (4096, 1) + shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' + 
shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_out = (32, 8, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor) + int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor) + conv_scale = 7.6294e-06 # scale to convert int8 output to floating point + int8_scale = 0.0078 # scale to convert int8 output to floating point + min = -128 + max = 127 + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class conv2d_int_model(nn.Module): + def __init__(self, in_planes=64, planes=64): + super(conv2d_int_model, self).__init__() + self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) + + def forward(self, x): + out_int = self.conv(x) + out_quant = out_int * conv_scale # int8 x int8 leads to int32 output + out_float = int8_scale * torch.clamp( + torch.round(out_quant / int8_scale), min, max + ) # converting to int8 range + return out_float + + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = conv2d_int_model() + model.eval() + model.conv.weight.data.copy_(int_weight) + + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") + total_wts = np.concatenate((wts1), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 8, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(64, 32, 32) + ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and 
the golden reference + # ------------------------------------------------------ + + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=2 * int8_scale, + ) + print("\nPASS!\n") + + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) From 7f9441018937f5440943e075cf71588d52d5f1b6 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Mon, 22 Apr 2024 11:59:17 -0600 Subject: [PATCH 08/11] runtime argument fixes --- programming_examples/ml/bottleneck/Makefile | 2 +- programming_examples/ml/bottleneck/test.py | 348 ++++---- .../ml/conv2d_fused_relu/Makefile | 2 +- .../ml/conv2d_fused_relu/test.py | 270 +++--- .../ml/resnet/layers_conv2_x/Makefile | 2 +- .../ml/resnet/layers_conv2_x/test.py | 834 +++++++++--------- 6 files changed, 738 insertions(+), 720 deletions(-) diff --git a/programming_examples/ml/bottleneck/Makefile b/programming_examples/ml/bottleneck/Makefile index f5c6e4561f..47ca6a78f7 100755 --- a/programming_examples/ml/bottleneck/Makefile +++ b/programming_examples/ml/bottleneck/Makefile @@ -37,4 +37,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/bottleneck/test.py b/programming_examples/ml/bottleneck/test.py index 34f6347175..2613acbab2 100644 --- a/programming_examples/ml/bottleneck/test.py +++ b/programming_examples/ml/bottleneck/test.py @@ -14,177 +14,183 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute - +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "bottleneck_int8" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_in_act = (32, 32, 32, 8) -shape_in_wts1 = (8, 32, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_in_wts2 = (8, 8, 3, 3, 8, 8) # out,in,ky,kx,in8,out8 -shape_in_wts3 = (32, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_total_wts = (69632, 1) -shape_out = (32, 32, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 100, (1, 256, 32, 32)).type(torch.FloatTensor) -int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor) -int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor) -int_weight3 = torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor) - -inp_scale1 = 0.5 -inp_scale2 = 0.5 -inp_scale3 = 0.5 -inp_scale4 = 0.5 - -weight_scale1 = 0.5 -weight_scale2 = 0.5 -weight_scale3 = 0.5 - -combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2) -combined_scale2 = 
-math.log2(inp_scale2 * weight_scale2 / inp_scale3) -combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1) -combined_scale4 = -math.log2(inp_scale1 / inp_scale4) -conv_scale = 0.0039 # scale to convert int8 output to floating point -relu_scale = 0.0078 # scale to convert int8 output to floating point -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class bottleneck_int8(nn.Module): - def __init__(self, in_planes=256, planes=64): - super(bottleneck_int8, self).__init__() - self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False) - self.conv2 = nn.Conv2d( - 64, 64, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False) - - self.relu1 = nn.ReLU() - self.relu2 = nn.ReLU() - self.relu3 = nn.ReLU() - - def forward(self, x): - conv1_out = self.conv1(x) * inp_scale1 * weight_scale1 - relu1_out = torch.clamp( - torch.round(self.relu1(conv1_out) / inp_scale2), min, max - ) # convert to int and apply relu - conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2 - relu2_out = torch.clamp( - torch.round(self.relu2(conv2_out) / inp_scale3), min, max - ) - conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3 - same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127) - skip_add = inp_scale1 * (same_scale_init + int_inp) - final_out = inp_scale4 * ( - torch.clamp(torch.round(skip_add / inp_scale4), min, max) - ) - return final_out - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = bottleneck_int8() -model.eval() -model.conv1.weight.data.copy_(int_weight1) -model.conv2.weight.data.copy_(int_weight2) -model.conv3.weight.data.copy_(int_weight3) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") -wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") -wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") - -total_wts = np.concatenate((wts1, wts2, wts3), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4 - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - 
write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 32, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(256, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=inp_scale4, -) - -print("\nPASS!\n") +def main(opts): + design = "bottleneck_int8" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_in_act = (32, 32, 32, 8) + shape_in_wts1 = (8, 32, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_in_wts2 = (8, 8, 3, 3, 8, 8) # out,in,ky,kx,in8,out8 + shape_in_wts3 = (32, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_total_wts = (69632, 1) + shape_out = (32, 32, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 100, (1, 256, 32, 32)).type(torch.FloatTensor) + int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor) + int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor) + int_weight3 = torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor) + + inp_scale1 = 0.5 + inp_scale2 = 0.5 + inp_scale3 = 0.5 + inp_scale4 = 0.5 + + weight_scale1 = 0.5 + weight_scale2 = 0.5 + weight_scale3 = 0.5 + + combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2) + combined_scale2 = -math.log2(inp_scale2 * weight_scale2 / inp_scale3) + combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1) + combined_scale4 = -math.log2(inp_scale1 / inp_scale4) + conv_scale = 0.0039 # scale to convert int8 output to floating point + relu_scale = 0.0078 # scale to convert int8 output to floating point + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class bottleneck_int8(nn.Module): + def __init__(self, in_planes=256, planes=64): + 
super(bottleneck_int8, self).__init__() + self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d( + 64, 64, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False) + + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + + def forward(self, x): + conv1_out = self.conv1(x) * inp_scale1 * weight_scale1 + relu1_out = torch.clamp( + torch.round(self.relu1(conv1_out) / inp_scale2), min, max + ) # convert to int and apply relu + conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2 + relu2_out = torch.clamp( + torch.round(self.relu2(conv2_out) / inp_scale3), min, max + ) + conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3 + same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127) + skip_add = inp_scale1 * (same_scale_init + int_inp) + final_out = inp_scale4 * ( + torch.clamp(torch.round(skip_add / inp_scale4), min, max) + ) + return final_out + + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = bottleneck_int8() + model.eval() + model.conv1.weight.data.copy_(int_weight1) + model.conv2.weight.data.copy_(int_weight2) + model.conv3.weight.data.copy_(int_weight3) + + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + + total_wts = np.concatenate((wts1, wts2, wts3), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4 + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 32, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(256, 32, 32) + ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + 
atol=inp_scale4, + ) + + print("\nPASS!\n") + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/ml/conv2d_fused_relu/Makefile b/programming_examples/ml/conv2d_fused_relu/Makefile index 80cb34dc08..5911238a7a 100755 --- a/programming_examples/ml/conv2d_fused_relu/Makefile +++ b/programming_examples/ml/conv2d_fused_relu/Makefile @@ -32,4 +32,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE diff --git a/programming_examples/ml/conv2d_fused_relu/test.py b/programming_examples/ml/conv2d_fused_relu/test.py index 5bfe139112..05ea92d677 100644 --- a/programming_examples/ml/conv2d_fused_relu/test.py +++ b/programming_examples/ml/conv2d_fused_relu/test.py @@ -14,138 +14,144 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute - +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "conv2d_with_relu" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_total_wts = (4096, 1) -shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' -shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_out = (32, 8, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor) -int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor) -conv_scale = 0.0039 # scale to convert int8 output to floating point -relu_scale = 0.0078 # scale to convert int8 output to floating point -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class conv2d_relu_int_model(nn.Module): - def __init__(self, in_planes=64, planes=64): - super(conv2d_relu_int_model, self).__init__() - self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - out_int = self.conv(x) - out_float = out_int * conv_scale - out_int = self.relu(out_float) - out_float = relu_scale * torch.clamp( - torch.round(out_int / relu_scale), min, max - ) # converting to int to do proper clipping - return out_float - - -# ------------------------------------------------------ -# Pytorch baseline -# 
------------------------------------------------------ -model = conv2d_relu_int_model() -model.eval() -model.conv.weight.data.copy_(int_weight) -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") -total_wts = np.concatenate((wts1), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * relu_scale - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 8, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(64, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=2 * relu_scale, -) - -print("\nPASS!\n") +def main(opts): + design = "conv2d_with_relu" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_total_wts = (4096, 1) + shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' + shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_out = (32, 8, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor) + int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor) + conv_scale = 0.0039 # scale to convert int8 output to floating point + relu_scale = 0.0078 # scale to convert int8 output to floating point + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get 
device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class conv2d_relu_int_model(nn.Module): + def __init__(self, in_planes=64, planes=64): + super(conv2d_relu_int_model, self).__init__() + self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + out_int = self.conv(x) + out_float = out_int * conv_scale + out_int = self.relu(out_float) + out_float = relu_scale * torch.clamp( + torch.round(out_int / relu_scale), min, max + ) # converting to int to do proper clipping + return out_float + + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = conv2d_relu_int_model() + model.eval() + model.conv.weight.data.copy_(int_weight) + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") + total_wts = np.concatenate((wts1), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * relu_scale + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 8, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(64, 32, 32) + ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=2 * relu_scale, + ) + + print("\nPASS!\n") + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile index d8f1b7261a..6218e61fb5 100755 --- 
a/programming_examples/ml/resnet/layers_conv2_x/Makefile +++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile @@ -44,4 +44,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/test.py b/programming_examples/ml/resnet/layers_conv2_x/test.py index 02dc01b127..5784a4d30a 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/test.py +++ b/programming_examples/ml/resnet/layers_conv2_x/test.py @@ -14,423 +14,429 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute - +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "resnet_conv2_x_int8" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_in_act = (32, 8, 32, 8) -shape_total_wts = (212992, 1) -shape_out = (32, 32, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) -block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) -block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) -block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) -block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) - -block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor) -block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) -block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor) - -block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor) -block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) -block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor) - -init_scale = 0.5 -block_0_relu_1 = 0.5 -block_0_relu_2 = 0.5 -block_0_relu_3 = 0.5 - -block_0_weight_scale1 = 0.5 -block_0_weight_scale2 = 0.5 -block_0_weight_scale3 = 0.5 -block_0_weight_scale_skip = 0.5 - -block_1_relu_1 = 0.5 -block_1_relu_2 = 0.5 -block_1_relu_3 = 0.5 - -block_1_weight_scale1 = 0.5 -block_1_weight_scale2 = 0.5 -block_1_weight_scale3 = 0.5 -block_1_quant_add_1 = 0.5 - -block_2_relu_1 = 0.5 -block_2_relu_2 = 0.5 -block_2_relu_3 = 0.5 - -block_2_weight_scale1 = 0.5 -block_2_weight_scale2 = 0.5 -block_2_weight_scale3 = 0.5 -block_2_quant_add_1 = 0.5 - -block_0_combined_scale1 = -math.log2( - init_scale * block_0_weight_scale1 / block_0_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_0_combined_scale2 = -math.log2( - block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 -) # RHS after second 
conv3x3 | clip 0-->255 -block_0_combined_scale3 = -math.log2( - block_0_relu_2 * block_0_weight_scale3 / init_scale -) # RHS after third conv1x1 | clip -128-->+127 -block_0_combined_scale_skip = -math.log2( - init_scale * block_0_weight_scale_skip / init_scale -) # LHS after conv1x1 | clip -128-->+127 -block_0_combined_scale4 = -math.log2( - init_scale / block_0_relu_3 -) # After addition | clip 0-->255 - -block_1_combined_scale1 = -math.log2( - block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_1_combined_scale2 = -math.log2( - block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 -) # RHS after second conv3x3 | clip 0-->255 -block_1_combined_scale3 = -math.log2( - block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1 -) # RHS after third conv1x1 | clip -128-->+127 -block_1_combined_scale4 = -math.log2( - block_1_quant_add_1 / block_1_relu_3 -) # After addition | clip 0-->255 - -block_2_combined_scale1 = -math.log2( - block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_2_combined_scale2 = -math.log2( - block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2 -) # RHS after second conv3x3 | clip 0-->255 -block_2_combined_scale3 = -math.log2( - block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 -) # RHS after third conv1x1 | clip -128-->+127 -block_2_combined_scale4 = -math.log2( - block_2_quant_add_1 / block_2_relu_3 -) # After addition | clip 0-->255 - -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class resnet_conv2_x_int8(nn.Module): - expansion = 4 - - def __init__(self, in_planes=64, planes=64): - super(resnet_conv2_x_int8, self).__init__() - - self.shortcut = nn.Conv2d( - in_planes, self.expansion * planes, kernel_size=1, bias=False - ) - # Bottleneck 0 - self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) - self.block_0_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_0_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_0_relu1 = nn.ReLU() - self.block_0_relu2 = nn.ReLU() - self.block_0_relu3 = nn.ReLU() - - # Bottleneck 1 - self.block_1_conv1 = nn.Conv2d( - self.expansion * planes, planes, kernel_size=1, bias=False - ) - self.block_1_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_1_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_1_relu1 = nn.ReLU() - self.block_1_relu2 = nn.ReLU() - self.block_1_relu3 = nn.ReLU() - - # Bottleneck 2 - self.block_2_conv1 = nn.Conv2d( - self.expansion * planes, planes, kernel_size=1, bias=False - ) - self.block_2_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_2_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_2_relu1 = nn.ReLU() - self.block_2_relu2 = nn.ReLU() - 
self.block_2_relu3 = nn.ReLU() - - def forward(self, x): - # **************** Bottleneck 0 **************** - block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1 - block_0_relu1_out = torch.clamp( - torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), - min, - max, - ) # convert to int and apply relu - block_0_conv2_out = ( - self.block_0_conv2(block_0_relu1_out) - * block_0_relu_1 - * block_0_weight_scale2 - ) - block_0_relu2_out = torch.clamp( - torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), - min, - max, - ) - block_0_conv3_out = ( - self.block_0_conv3(block_0_relu2_out) - * block_0_relu_2 - * block_0_weight_scale3 - ) - block_0_rhf_same_scale = torch.clamp( - torch.round(block_0_conv3_out / init_scale), -128, 127 - ) - - block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip - block_0_lhs_same_scale = torch.clamp( - torch.round(block_0_lhs_conv / init_scale), -128, 127 - ) - # convert to int and apply relu - - block_0_skip_add = init_scale * ( - block_0_rhf_same_scale + block_0_lhs_same_scale - ) - block_0_final_out = torch.clamp( - torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max - ) - # **************** Bottleneck 1 **************** - block_1_conv1_out = ( - self.block_1_conv1(block_0_final_out) - * block_0_relu_3 - * block_1_weight_scale1 - ) - block_1_relu1_out = torch.clamp( - torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), - min, - max, - ) # convert to int and apply relu - block_1_conv2_out = ( - self.block_1_conv2(block_1_relu1_out) - * block_1_relu_1 - * block_1_weight_scale2 - ) - block_1_relu2_out = torch.clamp( - torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), - min, - max, - ) - block_1_conv3_out = ( - self.block_1_conv3(block_1_relu2_out) - * block_1_relu_2 - * block_1_weight_scale3 - ) - block_1_rhf_same_scale = torch.clamp( - torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 - ) - - block_1_skip_add = block_0_relu_3 * (block_1_rhf_same_scale + block_0_final_out) - block_1_final_out = torch.clamp( - torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max - ) - - # **************** Bottleneck 2 **************** - block_2_conv1_out = ( - self.block_2_conv1(block_1_final_out) - * block_1_relu_3 - * block_2_weight_scale1 - ) - block_2_relu1_out = torch.clamp( - torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1), - min, - max, - ) # convert to int and apply relu - block_2_conv2_out = ( - self.block_2_conv2(block_2_relu1_out) - * block_2_relu_1 - * block_2_weight_scale2 - ) - block_2_relu2_out = torch.clamp( - torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2), - min, - max, - ) - block_2_conv3_out = ( - self.block_2_conv3(block_2_relu2_out) - * block_2_relu_2 - * block_2_weight_scale3 - ) - block_2_rhf_same_scale = torch.clamp( - torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 - ) - - block_2_skip_add = block_1_relu_3 * (block_2_rhf_same_scale + block_1_final_out) - block_2_final_out = block_2_relu_3 * ( - torch.clamp( - torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), +def main(opts): + design = "resnet_conv2_x_int8" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # 
------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_in_act = (32, 8, 32, 8) + shape_total_wts = (212992, 1) + shape_out = (32, 32, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) + block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) + block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) + block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) + block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) + + block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor) + block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) + block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor) + + block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor) + block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) + block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor) + + init_scale = 0.5 + block_0_relu_1 = 0.5 + block_0_relu_2 = 0.5 + block_0_relu_3 = 0.5 + + block_0_weight_scale1 = 0.5 + block_0_weight_scale2 = 0.5 + block_0_weight_scale3 = 0.5 + block_0_weight_scale_skip = 0.5 + + block_1_relu_1 = 0.5 + block_1_relu_2 = 0.5 + block_1_relu_3 = 0.5 + + block_1_weight_scale1 = 0.5 + block_1_weight_scale2 = 0.5 + block_1_weight_scale3 = 0.5 + block_1_quant_add_1 = 0.5 + + block_2_relu_1 = 0.5 + block_2_relu_2 = 0.5 + block_2_relu_3 = 0.5 + + block_2_weight_scale1 = 0.5 + block_2_weight_scale2 = 0.5 + block_2_weight_scale3 = 0.5 + block_2_quant_add_1 = 0.5 + + block_0_combined_scale1 = -math.log2( + init_scale * block_0_weight_scale1 / block_0_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_0_combined_scale2 = -math.log2( + block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_0_combined_scale3 = -math.log2( + block_0_relu_2 * block_0_weight_scale3 / init_scale + ) # RHS after third conv1x1 | clip -128-->+127 + block_0_combined_scale_skip = -math.log2( + init_scale * block_0_weight_scale_skip / init_scale + ) # LHS after conv1x1 | clip -128-->+127 + block_0_combined_scale4 = -math.log2( + init_scale / block_0_relu_3 + ) # After addition | clip 0-->255 + + block_1_combined_scale1 = -math.log2( + block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_1_combined_scale2 = -math.log2( + block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_1_combined_scale3 = -math.log2( + block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1 + ) # RHS after third conv1x1 | clip -128-->+127 + block_1_combined_scale4 = -math.log2( + block_1_quant_add_1 / block_1_relu_3 + ) # After addition | clip 0-->255 + + block_2_combined_scale1 = -math.log2( + block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_2_combined_scale2 = -math.log2( + block_2_relu_1 * block_2_weight_scale2 
/ block_2_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_2_combined_scale3 = -math.log2( + block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 + ) # RHS after third conv1x1 | clip -128-->+127 + block_2_combined_scale4 = -math.log2( + block_2_quant_add_1 / block_2_relu_3 + ) # After addition | clip 0-->255 + + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class resnet_conv2_x_int8(nn.Module): + expansion = 4 + + def __init__(self, in_planes=64, planes=64): + super(resnet_conv2_x_int8, self).__init__() + + self.shortcut = nn.Conv2d( + in_planes, self.expansion * planes, kernel_size=1, bias=False + ) + # Bottleneck 0 + self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.block_0_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_0_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_0_relu1 = nn.ReLU() + self.block_0_relu2 = nn.ReLU() + self.block_0_relu3 = nn.ReLU() + + # Bottleneck 1 + self.block_1_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_1_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_1_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_1_relu1 = nn.ReLU() + self.block_1_relu2 = nn.ReLU() + self.block_1_relu3 = nn.ReLU() + + # Bottleneck 2 + self.block_2_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_2_conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.block_2_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_2_relu1 = nn.ReLU() + self.block_2_relu2 = nn.ReLU() + self.block_2_relu3 = nn.ReLU() + + def forward(self, x): + # **************** Bottleneck 0 **************** + block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + block_0_relu1_out = torch.clamp( + torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), + min, + max, + ) # convert to int and apply relu + block_0_conv2_out = ( + self.block_0_conv2(block_0_relu1_out) + * block_0_relu_1 + * block_0_weight_scale2 + ) + block_0_relu2_out = torch.clamp( + torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), min, max, ) - ) - return block_2_final_out - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = resnet_conv2_x_int8() -model.eval() -model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) -model.block_0_conv2.weight.data.copy_(block_0_int_weight_2) -model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) -model.shortcut.weight.data.copy_(block_0_int_weight_skip) - -model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) -model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) 
-model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) - -model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) -model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) -model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -block0_wts1 = ds.reorder_mat( - block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts2 = ds.reorder_mat( - block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts3 = ds.reorder_mat( - block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts_skip = ds.reorder_mat( - block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts = np.concatenate( - (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None -) - -block1_wts1 = ds.reorder_mat( - block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block1_wts2 = ds.reorder_mat( - block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block1_wts3 = ds.reorder_mat( - block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts2 = np.concatenate( - (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None -) - -block2_wts1 = ds.reorder_mat( - block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block2_wts2 = ds.reorder_mat( - block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block2_wts3 = ds.reorder_mat( - block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts3 = np.concatenate( - (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None -) - -total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * block_2_relu_3 - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 32, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(256, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=block_2_relu_3, -) - 
-print("\nPASS!\n") + block_0_conv3_out = ( + self.block_0_conv3(block_0_relu2_out) + * block_0_relu_2 + * block_0_weight_scale3 + ) + block_0_rhf_same_scale = torch.clamp( + torch.round(block_0_conv3_out / init_scale), -128, 127 + ) + + block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip + block_0_lhs_same_scale = torch.clamp( + torch.round(block_0_lhs_conv / init_scale), -128, 127 + ) + # convert to int and apply relu + + block_0_skip_add = init_scale * ( + block_0_rhf_same_scale + block_0_lhs_same_scale + ) + block_0_final_out = torch.clamp( + torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max + ) + # **************** Bottleneck 1 **************** + block_1_conv1_out = ( + self.block_1_conv1(block_0_final_out) + * block_0_relu_3 + * block_1_weight_scale1 + ) + block_1_relu1_out = torch.clamp( + torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), + min, + max, + ) # convert to int and apply relu + block_1_conv2_out = ( + self.block_1_conv2(block_1_relu1_out) + * block_1_relu_1 + * block_1_weight_scale2 + ) + block_1_relu2_out = torch.clamp( + torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), + min, + max, + ) + block_1_conv3_out = ( + self.block_1_conv3(block_1_relu2_out) + * block_1_relu_2 + * block_1_weight_scale3 + ) + block_1_rhf_same_scale = torch.clamp( + torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 + ) + + block_1_skip_add = block_0_relu_3 * (block_1_rhf_same_scale + block_0_final_out) + block_1_final_out = torch.clamp( + torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max + ) + + # **************** Bottleneck 2 **************** + block_2_conv1_out = ( + self.block_2_conv1(block_1_final_out) + * block_1_relu_3 + * block_2_weight_scale1 + ) + block_2_relu1_out = torch.clamp( + torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1), + min, + max, + ) # convert to int and apply relu + block_2_conv2_out = ( + self.block_2_conv2(block_2_relu1_out) + * block_2_relu_1 + * block_2_weight_scale2 + ) + block_2_relu2_out = torch.clamp( + torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2), + min, + max, + ) + block_2_conv3_out = ( + self.block_2_conv3(block_2_relu2_out) + * block_2_relu_2 + * block_2_weight_scale3 + ) + block_2_rhf_same_scale = torch.clamp( + torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 + ) + + block_2_skip_add = block_1_relu_3 * (block_2_rhf_same_scale + block_1_final_out) + block_2_final_out = block_2_relu_3 * ( + torch.clamp( + torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), + min, + max, + ) + ) + return block_2_final_out + + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = resnet_conv2_x_int8() + model.eval() + model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) + model.block_0_conv2.weight.data.copy_(block_0_int_weight_2) + model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) + model.shortcut.weight.data.copy_(block_0_int_weight_skip) + + model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) + model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) + model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) + + model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) + model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) + model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) + + golden_output = model(int_inp) + + # 
------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + block0_wts1 = ds.reorder_mat( + block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts2 = ds.reorder_mat( + block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts3 = ds.reorder_mat( + block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts_skip = ds.reorder_mat( + block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts = np.concatenate( + (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None + ) + + block1_wts1 = ds.reorder_mat( + block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts2 = ds.reorder_mat( + block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts3 = ds.reorder_mat( + block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts2 = np.concatenate( + (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None + ) + + block2_wts1 = ds.reorder_mat( + block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts2 = ds.reorder_mat( + block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts3 = ds.reorder_mat( + block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts3 = np.concatenate( + (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None + ) + + total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * block_2_relu_3 + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 32, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(256, 32, 32) + ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=block_2_relu_3, + ) + + print("\nPASS!\n") + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) From 223ed0aa4c5d17e3768ac13ccb153f84df444e32 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 
Apr 2024 20:31:29 -0600 Subject: [PATCH 09/11] Black files --- programming_examples/ml/bottleneck/test.py | 21 +++-- programming_examples/ml/conv2d/test.py | 16 ++-- .../ml/conv2d_fused_relu/test.py | 17 ++-- .../ml/resnet/layers_conv2_x/test.py | 82 ++++++++++++++----- 4 files changed, 101 insertions(+), 35 deletions(-) diff --git a/programming_examples/ml/bottleneck/test.py b/programming_examples/ml/bottleneck/test.py index 2613acbab2..48a9a8929c 100644 --- a/programming_examples/ml/bottleneck/test.py +++ b/programming_examples/ml/bottleneck/test.py @@ -15,9 +15,11 @@ import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils + torch.use_deterministic_algorithms(True) torch.manual_seed(0) + def main(opts): design = "bottleneck_int8" xclbin_path = opts.xclbin @@ -90,7 +92,6 @@ def main(opts): trace_size=trace_size, ) - # ------------------------------------------------------ # Define your golden reference # ------------------------------------------------------ @@ -117,14 +118,15 @@ def forward(self, x): torch.round(self.relu2(conv2_out) / inp_scale3), min, max ) conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3 - same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127) + same_scale_init = torch.clamp( + torch.round(conv3_out / inp_scale1), -128, 127 + ) skip_add = inp_scale1 * (same_scale_init + int_inp) final_out = inp_scale4 * ( torch.clamp(torch.round(skip_add / inp_scale4), min, max) ) return final_out - # ------------------------------------------------------ # Pytorch baseline # ------------------------------------------------------ @@ -141,7 +143,9 @@ def forward(self, x): # ------------------------------------------------------ ds = DataShaper() before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") @@ -161,7 +165,9 @@ def forward(self, x): stop = time.time_ns() if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) write_out_trace(trace, trace_file) npu_time = stop - start @@ -173,7 +179,9 @@ def forward(self, x): temp_out = aie_output.reshape(32, 32, 32, 8) temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") ofm_mem_fmt = temp_out.reshape(256, 32, 32) - ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) # ------------------------------------------------------ @@ -190,6 +198,7 @@ def forward(self, x): print("\nPASS!\n") + if __name__ == "__main__": p = test_utils.create_default_argparser() opts = p.parse_args(sys.argv[1:]) diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py index 9d8d08e763..1a8d2e7712 100644 --- a/programming_examples/ml/conv2d/test.py +++ b/programming_examples/ml/conv2d/test.py @@ -15,9 +15,11 @@ import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils + torch.use_deterministic_algorithms(True) 
torch.manual_seed(0) + def main(opts): design = "conv2d" xclbin_path = opts.xclbin @@ -71,7 +73,6 @@ def main(opts): trace_size=trace_size, ) - # ------------------------------------------------------ # Define your golden reference # ------------------------------------------------------ @@ -88,7 +89,6 @@ def forward(self, x): ) # converting to int8 range return out_float - # ------------------------------------------------------ # Pytorch baseline # ------------------------------------------------------ @@ -103,7 +103,9 @@ def forward(self, x): # ------------------------------------------------------ ds = DataShaper() before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") @@ -120,7 +122,9 @@ def forward(self, x): stop = time.time_ns() if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) write_out_trace(trace, trace_file) npu_time = stop - start @@ -132,7 +136,9 @@ def forward(self, x): temp_out = aie_output.reshape(32, 8, 32, 8) temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") ofm_mem_fmt = temp_out.reshape(64, 32, 32) - ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) # ------------------------------------------------------ diff --git a/programming_examples/ml/conv2d_fused_relu/test.py b/programming_examples/ml/conv2d_fused_relu/test.py index 05ea92d677..6fe407faaa 100644 --- a/programming_examples/ml/conv2d_fused_relu/test.py +++ b/programming_examples/ml/conv2d_fused_relu/test.py @@ -15,9 +15,11 @@ import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils + torch.use_deterministic_algorithms(True) torch.manual_seed(0) + def main(opts): design = "conv2d_with_relu" xclbin_path = opts.xclbin @@ -72,7 +74,6 @@ def main(opts): trace_size=trace_size, ) - # ------------------------------------------------------ # Define your golden reference # ------------------------------------------------------ @@ -91,7 +92,6 @@ def forward(self, x): ) # converting to int to do proper clipping return out_float - # ------------------------------------------------------ # Pytorch baseline # ------------------------------------------------------ @@ -105,7 +105,9 @@ def forward(self, x): # ------------------------------------------------------ ds = DataShaper() before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") @@ -122,7 +124,9 @@ def forward(self, x): stop = time.time_ns() if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) 
write_out_trace(trace, trace_file) npu_time = stop - start @@ -134,7 +138,9 @@ def forward(self, x): temp_out = aie_output.reshape(32, 8, 32, 8) temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") ofm_mem_fmt = temp_out.reshape(64, 32, 32) - ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) # ------------------------------------------------------ @@ -151,6 +157,7 @@ def forward(self, x): print("\nPASS!\n") + if __name__ == "__main__": p = test_utils.create_default_argparser() opts = p.parse_args(sys.argv[1:]) diff --git a/programming_examples/ml/resnet/layers_conv2_x/test.py b/programming_examples/ml/resnet/layers_conv2_x/test.py index 5784a4d30a..48b45b99ae 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/test.py +++ b/programming_examples/ml/resnet/layers_conv2_x/test.py @@ -15,9 +15,11 @@ import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils + torch.use_deterministic_algorithms(True) torch.manual_seed(0) + def main(opts): design = "resnet_conv2_x_int8" xclbin_path = opts.xclbin @@ -51,16 +53,28 @@ def main(opts): int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) - block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) - block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) + block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type( + torch.FloatTensor + ) - block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor) + block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type( + torch.FloatTensor + ) block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) - block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor) + block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type( + torch.FloatTensor + ) - block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor) + block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type( + torch.FloatTensor + ) block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) - block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor) + block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type( + torch.FloatTensor + ) init_scale = 0.5 block_0_relu_1 = 0.5 @@ -151,7 +165,6 @@ def main(opts): trace_size=trace_size, ) - # ------------------------------------------------------ # Define your golden reference # ------------------------------------------------------ @@ -167,7 +180,12 @@ def __init__(self, in_planes=64, planes=64): # Bottleneck 0 self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) self.block_0_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, ) self.block_0_conv3 = nn.Conv2d( planes, self.expansion * planes, kernel_size=1, bias=False @@ -182,7 +200,12 @@ def __init__(self, 
in_planes=64, planes=64): self.expansion * planes, planes, kernel_size=1, bias=False ) self.block_1_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, ) self.block_1_conv3 = nn.Conv2d( planes, self.expansion * planes, kernel_size=1, bias=False @@ -197,7 +220,12 @@ def __init__(self, in_planes=64, planes=64): self.expansion * planes, planes, kernel_size=1, bias=False ) self.block_2_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, ) self.block_2_conv3 = nn.Conv2d( planes, self.expansion * planes, kernel_size=1, bias=False @@ -209,7 +237,9 @@ def __init__(self, in_planes=64, planes=64): def forward(self, x): # **************** Bottleneck 0 **************** - block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + block_0_conv1_out = ( + self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + ) block_0_relu1_out = torch.clamp( torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), min, @@ -244,7 +274,9 @@ def forward(self, x): block_0_rhf_same_scale + block_0_lhs_same_scale ) block_0_final_out = torch.clamp( - torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max + torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), + min, + max, ) # **************** Bottleneck 1 **************** block_1_conv1_out = ( @@ -276,9 +308,13 @@ def forward(self, x): torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 ) - block_1_skip_add = block_0_relu_3 * (block_1_rhf_same_scale + block_0_final_out) + block_1_skip_add = block_0_relu_3 * ( + block_1_rhf_same_scale + block_0_final_out + ) block_1_final_out = torch.clamp( - torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max + torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), + min, + max, ) # **************** Bottleneck 2 **************** @@ -311,7 +347,9 @@ def forward(self, x): torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 ) - block_2_skip_add = block_1_relu_3 * (block_2_rhf_same_scale + block_1_final_out) + block_2_skip_add = block_1_relu_3 * ( + block_2_rhf_same_scale + block_1_final_out + ) block_2_final_out = block_2_relu_3 * ( torch.clamp( torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), @@ -321,7 +359,6 @@ def forward(self, x): ) return block_2_final_out - # ------------------------------------------------------ # Pytorch baseline # ------------------------------------------------------ @@ -347,7 +384,9 @@ def forward(self, x): # ------------------------------------------------------ ds = DataShaper() before_input = int_inp.squeeze().data.numpy().astype(dtype_in) - before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") @@ -407,7 +446,9 @@ def forward(self, x): stop = time.time_ns() if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) write_out_trace(trace, trace_file) npu_time = stop - start @@ -419,7 +460,9 @@ def forward(self, x): 
temp_out = aie_output.reshape(32, 32, 32, 8) temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") ofm_mem_fmt = temp_out.reshape(256, 32, 32) - ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) # ------------------------------------------------------ @@ -436,6 +479,7 @@ def forward(self, x): print("\nPASS!\n") + if __name__ == "__main__": p = test_utils.create_default_argparser() opts = p.parse_args(sys.argv[1:]) From 284b6114e9f10995430de7b1523aceb709f267f2 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 20:33:03 -0600 Subject: [PATCH 10/11] Fix run.lit --- programming_examples/ml/bottleneck/run.lit | 2 +- programming_examples/ml/conv2d/run.lit | 2 +- programming_examples/ml/conv2d_fused_relu/run.lit | 2 +- programming_examples/ml/resnet/layers_conv2_x/run.lit | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/programming_examples/ml/bottleneck/run.lit b/programming_examples/ml/bottleneck/run.lit index ec30002c97..8a6024d66e 100644 --- a/programming_examples/ml/bottleneck/run.lit +++ b/programming_examples/ml/bottleneck/run.lit @@ -8,5 +8,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit index 5220b6f5e4..349e45f9bc 100644 --- a/programming_examples/ml/conv2d/run.lit +++ b/programming_examples/ml/conv2d/run.lit @@ -6,5 +6,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_i8.cc -o conv2dk1_i8.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/run.lit b/programming_examples/ml/conv2d_fused_relu/run.lit index 0c122f451e..cfddde9013 100644 --- a/programming_examples/ml/conv2d_fused_relu/run.lit +++ b/programming_examples/ml/conv2d_fused_relu/run.lit @@ -6,5 +6,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DINT8_ACT -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! 
\ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/run.lit b/programming_examples/ml/resnet/layers_conv2_x/run.lit index 61f43e45e6..c35a868772 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/run.lit +++ b/programming_examples/ml/resnet/layers_conv2_x/run.lit @@ -10,5 +10,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file From 377082bff2a87fcb986a8ef564d5e03b0fd4304b Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 20:41:10 -0600 Subject: [PATCH 11/11] Fix resnet includes --- programming_examples/ml/resnet/layers_conv2_x/aie2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 235b5c5308..f5243070d9 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -7,8 +7,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * +from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith -from aie.dialects.scf import for_, yield_ from aie.extras.context import mlir_mod_ctx from aie.ir import MemRefType, TypeAttr
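
A note for readers following the golden-reference model in test.py: every bottleneck stage requantizes its convolution output with the same clamp(round(x / scale)) pattern. Below is a minimal, standalone sketch of that pattern; the helper name and the default output range are illustrative and not part of the repository.

import torch

def requantize(x, scale, qmin=0, qmax=255):
    # Mirrors the torch.clamp(torch.round(... / scale), min, max) pattern
    # applied after each convolution in the golden reference.
    return torch.clamp(torch.round(x / scale), qmin, qmax)

# Example: squash a fake conv output into the uint8 activation range.
y = requantize(torch.randn(1, 64, 32, 32) * 10.0, scale=0.5)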
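The performance number reported by test.py comes from wall-clock timestamps taken immediately around the accelerator call (npu_time = stop - start). A small sketch of that bookkeeping is below, with a placeholder callable standing in for the execute() helper used by the test.

import time

def timed_run(run_fn, *args, **kwargs):
    # Bracket the accelerator call with time.time_ns(), as test.py does;
    # run_fn is a placeholder for that call.
    start = time.time_ns()
    out = run_fn(*args, **kwargs)
    stop = time.time_ns()
    return out, stop - start  # result plus elapsed time in nanoseconds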
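The updated RUN lines now pass the xclbin, the instruction stream, and the kernel name to test.py explicitly (-x aie.xclbin -i insts.txt -k MLIR_AIE) instead of relying on defaults. The sketch below shows the kind of argument handling those flags imply; it is a hypothetical stand-in for test_utils.create_default_argparser(), and only the -x / -i / -k spellings and the --xclbin destination are taken from the patches themselves.

import argparse

def make_test_argparser():
    # Hypothetical equivalent of aie.utils.test.create_default_argparser();
    # option names other than the flag spellings are assumptions.
    p = argparse.ArgumentParser()
    p.add_argument("-x", "--xclbin", required=True, help="xclbin built by aiecc.py")
    p.add_argument("-i", "--instr", required=True, help="NPU instruction stream (insts.txt)")
    p.add_argument("-k", "--kernel", default="MLIR_AIE", help="kernel name inside the xclbin")
    return p

opts = make_test_argparser().parse_args(["-x", "aie.xclbin", "-i", "insts.txt", "-k", "MLIR_AIE"])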