From f33b3caf27c86447ceb2fc3ccf91e385cf2398c4 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Mon, 22 Apr 2024 21:48:23 -0600
Subject: [PATCH] Add torch to lit cfg for programming_examples (#1370)

Co-authored-by: singagan <53442471+singagan@users.noreply.github.com>
Co-authored-by: Gagandeep Singh <gagan.posted@gmail.com>
---
 .github/workflows/buildAndTestRyzenAI.yml     |   1 +
 programming_examples/lit.cfg.py               |   8 +
 programming_examples/ml/bottleneck/Makefile   |   2 +-
 programming_examples/ml/bottleneck/README.md  |   7 -
 programming_examples/ml/bottleneck/run.lit    |   2 +-
 programming_examples/ml/bottleneck/test.py    | 355 +++----
 programming_examples/ml/conv2d/Makefile       |   2 +-
 programming_examples/ml/conv2d/README.md      |   9 +-
 programming_examples/ml/conv2d/run.lit        |   4 +-
 programming_examples/ml/conv2d/test.py        | 271 +++---
 .../ml/conv2d_fused_relu/Makefile             |   2 +-
 .../ml/conv2d_fused_relu/README.md            |   9 +-
 .../ml/conv2d_fused_relu/run.lit              |   2 +-
 .../ml/conv2d_fused_relu/test.py              | 275 +++---
 programming_examples/ml/resnet/README.md      |   8 -
 .../ml/resnet/layers_conv2_x/Makefile         |   2 +-
 .../ml/resnet/layers_conv2_x/aie2.py          |   2 +-
 .../ml/resnet/layers_conv2_x/run.lit          |   2 +-
 .../ml/resnet/layers_conv2_x/test.py          | 876 +++++++++---------
 programming_guide/section-6/README.md         |   8 +-
 20 files changed, 962 insertions(+), 885 deletions(-)

diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml
index acf2262fa2..bc3988e002 100644
--- a/.github/workflows/buildAndTestRyzenAI.yml
+++ b/.github/workflows/buildAndTestRyzenAI.yml
@@ -127,6 +127,7 @@ jobs:
           python -m venv aie-venv
           source aie-venv/bin/activate
           pip install -r python/requirements.txt
+          pip install -r python/requirements_ml.txt
           pip install jupyter
           sed -i.bak 's/OUTPUT_TIMEOUT = 10/OUTPUT_TIMEOUT = 100/g' \
             $(python -c 'import site; print(site.getsitepackages()[0])')/jupyter_client/runapp.py
diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index d5ff22c85e..b28803cb43 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -104,6 +104,14 @@
     opencv_flags = ""
 config.substitutions.append(("%opencv_flags", opencv_flags))
 
+try:
+    import torch
+
+    config.available_features.add("torch")
+except ImportError:
+    # torch is optional: without the feature, `REQUIRES: torch` tests are not run.
+    print("torch not found", file=sys.stderr)
+
 VitisSysrootFlag = ""
 if config.aieHostTarget == "x86_64":
     config.substitutions.append(("%aieHostTargetTriplet%", "x86_64-unknown-linux-gnu"))
diff --git a/programming_examples/ml/bottleneck/Makefile b/programming_examples/ml/bottleneck/Makefile
index f5c6e4561f..47ca6a78f7 100755
--- a/programming_examples/ml/bottleneck/Makefile
+++ b/programming_examples/ml/bottleneck/Makefile
@@ -37,4 +37,4 @@ clean:
 		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
 
 run_py: 
-	${powershell} python3 test.py
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
\ No newline at end of file
diff --git a/programming_examples/ml/bottleneck/README.md b/programming_examples/ml/bottleneck/README.md
index 144b8e36f2..40a69e8576 100644
--- a/programming_examples/ml/bottleneck/README.md
+++ b/programming_examples/ml/bottleneck/README.md
@@ -115,11 +115,4 @@ make
 To run the design:
 ```
 make run_py
-```
-
-### Prerequisites
-To install the dependencies, run the following command:
-```
-pip install -r requirements.txt
-
 ```
\ No newline at end of file
diff --git a/programming_examples/ml/bottleneck/run.lit b/programming_examples/ml/bottleneck/run.lit
index ec30002c97..8a6024d66e 100644
--- a/programming_examples/ml/bottleneck/run.lit
+++ b/programming_examples/ml/bottleneck/run.lit
@@ -8,5 +8,5 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
 // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
-// RUN: %run_on_ipu %python %S/test.py | FileCheck %s
+// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
 // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/ml/bottleneck/test.py b/programming_examples/ml/bottleneck/test.py
index 34f6347175..48a9a8929c 100644
--- a/programming_examples/ml/bottleneck/test.py
+++ b/programming_examples/ml/bottleneck/test.py
@@ -14,177 +14,192 @@
 import os
 import numpy as np
 from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute
+import aie.utils.test as test_utils
 
 torch.use_deterministic_algorithms(True)
 torch.manual_seed(0)
 
-design = "bottleneck_int8"
-xclbin_path = os.path.abspath("build/final.xclbin")
-insts_path = os.path.abspath("build/insts.txt")
-
-log_folder = "log/"
-if not os.path.exists(log_folder):
-    os.makedirs(log_folder)
-
-num_iter = 1
-npu_time_total = 0
-npu_time_min = 9999999
-npu_time_max = 0
-trace_size = 16384
-enable_trace = False
-trace_file = "log/trace_" + design + ".txt"
-# ------------------------------------------------------
-# Configure this to match your design's buffer size
-# ------------------------------------------------------
-dtype_in = np.dtype("int8")
-dtype_wts = np.dtype("int8")
-dtype_out = np.dtype("uint8")
-
-shape_in_act = (32, 32, 32, 8)
-shape_in_wts1 = (8, 32, 1, 1, 8, 8)  # out,in,ky,kx,in8,out8
-shape_in_wts2 = (8, 8, 3, 3, 8, 8)  # out,in,ky,kx,in8,out8
-shape_in_wts3 = (32, 8, 1, 1, 8, 8)  # out,in,ky,kx,in8,out8
-shape_total_wts = (69632, 1)
-shape_out = (32, 32, 32, 8)
-
-# ------------------------------------------------------
-# Initialize activation, weights, scaling factor for int8 model
-# ------------------------------------------------------
-int_inp = torch.randint(1, 100, (1, 256, 32, 32)).type(torch.FloatTensor)
-int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor)
-int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor)
-int_weight3 = torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor)
-
-inp_scale1 = 0.5
-inp_scale2 = 0.5
-inp_scale3 = 0.5
-inp_scale4 = 0.5
-
-weight_scale1 = 0.5
-weight_scale2 = 0.5
-weight_scale3 = 0.5
-
-combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2)
-combined_scale2 = -math.log2(inp_scale2 * weight_scale2 / inp_scale3)
-combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1)
-combined_scale4 = -math.log2(inp_scale1 / inp_scale4)
-conv_scale = 0.0039  # scale to convert int8 output to floating point
-relu_scale = 0.0078  # scale to convert int8 output to floating point
-min = 0
-max = 255
-
-# ------------------------------------------------------
-# Get device, load the xclbin & kernel and register them
-# ------------------------------------------------------
-app = setup_aie(
-    xclbin_path,
-    insts_path,
-    shape_in_act,
-    dtype_in,
-    shape_total_wts,
-    dtype_wts,
-    shape_out,
-    dtype_out,
-    enable_trace=enable_trace,
-    trace_size=trace_size,
-)
-
-
-# ------------------------------------------------------
-# Define your golden reference
-# ------------------------------------------------------
-class bottleneck_int8(nn.Module):
-    def __init__(self, in_planes=256, planes=64):
-        super(bottleneck_int8, self).__init__()
-        self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False)
-        self.conv2 = nn.Conv2d(
-            64, 64, kernel_size=3, padding=1, padding_mode="zeros", bias=False
-        )
-        self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False)
-
-        self.relu1 = nn.ReLU()
-        self.relu2 = nn.ReLU()
-        self.relu3 = nn.ReLU()
-
-    def forward(self, x):
-        conv1_out = self.conv1(x) * inp_scale1 * weight_scale1
-        relu1_out = torch.clamp(
-            torch.round(self.relu1(conv1_out) / inp_scale2), min, max
-        )  # convert to int and apply relu
-        conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2
-        relu2_out = torch.clamp(
-            torch.round(self.relu2(conv2_out) / inp_scale3), min, max
-        )
-        conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3
-        same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127)
-        skip_add = inp_scale1 * (same_scale_init + int_inp)
-        final_out = inp_scale4 * (
-            torch.clamp(torch.round(skip_add / inp_scale4), min, max)
-        )
-        return final_out
-
-
-# ------------------------------------------------------
-# Pytorch baseline
-# ------------------------------------------------------
-model = bottleneck_int8()
-model.eval()
-model.conv1.weight.data.copy_(int_weight1)
-model.conv2.weight.data.copy_(int_weight2)
-model.conv3.weight.data.copy_(int_weight3)
-
-golden_output = model(int_inp)
-
-# ------------------------------------------------------
-# Reorder input data-layout
-# ------------------------------------------------------
-ds = DataShaper()
-before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
-before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
-ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
-ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
-
-wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX")
-wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX")
-wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX")
-
-total_wts = np.concatenate((wts1, wts2, wts3), axis=None)
-total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
-
-# ------------------------------------------------------
-# Main run loop
-# ------------------------------------------------------
-for i in range(num_iter):
-    start = time.time_ns()
-    aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4
-    stop = time.time_ns()
-
-    if enable_trace:
-        aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size)
-        write_out_trace(trace, trace_file)
-
-    npu_time = stop - start
-    npu_time_total = npu_time_total + npu_time
-
-# ------------------------------------------------------
-# Reorder output data-layout
-# ------------------------------------------------------
-temp_out = aie_output.reshape(32, 32, 32, 8)
-temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
-ofm_mem_fmt = temp_out.reshape(256, 32, 32)
-ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d")
-ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
-
-# ------------------------------------------------------
-# Compare the AIE output and the golden reference
-# ------------------------------------------------------
-print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
-
-assert np.allclose(
-    ofm_mem_fmt_out.detach().numpy(),
-    golden_output.detach().numpy(),
-    rtol=0,
-    atol=inp_scale4,
-)
-
-print("\nPASS!\n")
+
+def main(opts):
+    """Run the int8 bottleneck design on the NPU and check it against PyTorch."""
+    design = "bottleneck_int8"
+    xclbin_path = opts.xclbin
+    insts_path = opts.instr
+
+    log_folder = "log/"
+    os.makedirs(log_folder, exist_ok=True)  # the tofile() dumps below land here
+
+    num_iter = 1
+    npu_time_total = 0
+    trace_size = 16384
+    enable_trace = False
+    trace_file = "log/trace_" + design + ".txt"
+    # ------------------------------------------------------
+    # Configure this to match your design's buffer size
+    # ------------------------------------------------------
+    dtype_in = np.dtype("int8")
+    dtype_wts = np.dtype("int8")
+    dtype_out = np.dtype("uint8")
+
+    shape_in_act = (32, 32, 32, 8)
+    shape_in_wts1 = (8, 32, 1, 1, 8, 8)  # out,in,ky,kx,in8,out8
+    shape_in_wts2 = (8, 8, 3, 3, 8, 8)  # out,in,ky,kx,in8,out8
+    shape_in_wts3 = (32, 8, 1, 1, 8, 8)  # out,in,ky,kx,in8,out8
+    shape_total_wts = (69632, 1)  # element counts of wts1 + wts2 + wts3
+    shape_out = (32, 32, 32, 8)
+
+    # ------------------------------------------------------
+    # Initialize activation, weights, scaling factor for int8 model
+    # ------------------------------------------------------
+    int_inp = torch.randint(1, 100, (1, 256, 32, 32)).type(torch.FloatTensor)
+    int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor)
+    int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor)
+    int_weight3 = torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor)
+
+    inp_scale1 = 0.5
+    inp_scale2 = 0.5
+    inp_scale3 = 0.5
+    inp_scale4 = 0.5
+
+    weight_scale1 = 0.5
+    weight_scale2 = 0.5
+    weight_scale3 = 0.5
+
+    combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2)
+    combined_scale2 = -math.log2(inp_scale2 * weight_scale2 / inp_scale3)
+    combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1)
+    combined_scale4 = -math.log2(inp_scale1 / inp_scale4)
+    conv_scale = 0.0039  # scale to convert int8 output to floating point
+    relu_scale = 0.0078  # scale to convert int8 output to floating point
+    # uint8 saturation bounds; named so the builtins min()/max() are not shadowed.
+    clamp_min = 0
+    clamp_max = 255
+
+    # ------------------------------------------------------
+    # Get device, load the xclbin & kernel and register them
+    # ------------------------------------------------------
+    app = setup_aie(
+        xclbin_path,
+        insts_path,
+        shape_in_act,
+        dtype_in,
+        shape_total_wts,
+        dtype_wts,
+        shape_out,
+        dtype_out,
+        enable_trace=enable_trace,
+        trace_size=trace_size,
+    )
+
+    # ------------------------------------------------------
+    # Define your golden reference
+    # ------------------------------------------------------
+    class bottleneck_int8(nn.Module):
+        def __init__(self, in_planes=256, planes=64):
+            super(bottleneck_int8, self).__init__()
+            self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False)
+            self.conv2 = nn.Conv2d(
+                64, 64, kernel_size=3, padding=1, padding_mode="zeros", bias=False
+            )
+            self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False)
+
+            self.relu1 = nn.ReLU()
+            self.relu2 = nn.ReLU()
+            self.relu3 = nn.ReLU()
+
+        def forward(self, x):
+            conv1_out = self.conv1(x) * inp_scale1 * weight_scale1
+            relu1_out = torch.clamp(
+                torch.round(self.relu1(conv1_out) / inp_scale2), clamp_min, clamp_max
+            )  # convert to int and apply relu
+            conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2
+            relu2_out = torch.clamp(
+                torch.round(self.relu2(conv2_out) / inp_scale3), clamp_min, clamp_max
+            )
+            conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3
+            same_scale_init = torch.clamp(
+                torch.round(conv3_out / inp_scale1), -128, 127
+            )
+            skip_add = inp_scale1 * (same_scale_init + int_inp)
+            final_out = inp_scale4 * (
+                torch.clamp(torch.round(skip_add / inp_scale4), clamp_min, clamp_max)
+            )
+            return final_out
+
+    # ------------------------------------------------------
+    # Pytorch baseline
+    # ------------------------------------------------------
+    model = bottleneck_int8()
+    model.eval()
+    model.conv1.weight.data.copy_(int_weight1)
+    model.conv2.weight.data.copy_(int_weight2)
+    model.conv3.weight.data.copy_(int_weight3)
+
+    golden_output = model(int_inp)
+
+    # ------------------------------------------------------
+    # Reorder input data-layout
+    # ------------------------------------------------------
+    ds = DataShaper()
+    before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
+    before_input.tofile(
+        log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d"
+    )
+    ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
+    ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
+
+    wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX")
+    wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX")
+    wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX")
+
+    total_wts = np.concatenate((wts1, wts2, wts3), axis=None)
+    total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
+
+    # ------------------------------------------------------
+    # Main run loop
+    # ------------------------------------------------------
+    for i in range(num_iter):
+        start = time.time_ns()
+        aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4
+        stop = time.time_ns()
+
+        if enable_trace:
+            aie_output, trace = extract_trace(
+                aie_output, shape_out, dtype_out, trace_size
+            )
+            write_out_trace(trace, trace_file)
+
+        npu_time = stop - start
+        npu_time_total = npu_time_total + npu_time
+
+    # ------------------------------------------------------
+    # Reorder output data-layout
+    # ------------------------------------------------------
+    temp_out = aie_output.reshape(32, 32, 32, 8)
+    temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
+    ofm_mem_fmt = temp_out.reshape(256, 32, 32)
+    ofm_mem_fmt.tofile(
+        log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d"
+    )
+    ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
+
+    # ------------------------------------------------------
+    # Compare the AIE output and the golden reference
+    # ------------------------------------------------------
+    print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
+
+    if not np.allclose(
+        ofm_mem_fmt_out.detach().numpy(),
+        golden_output.detach().numpy(),
+        rtol=0,
+        atol=inp_scale4,  # one step of the output scale
+    ):  # explicit raise: a bare assert would be stripped by `python -O`
+        raise AssertionError("AIE output does not match the golden reference")
+
+    print("\nPASS!\n")
+
+
+if __name__ == "__main__":  # run as: test.py -x <xclbin> -i <insts> -k MLIR_AIE
+    p = test_utils.create_default_argparser()
+    opts = p.parse_args(sys.argv[1:])
+    main(opts)
diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile
index 2bba6ea11c..0a89ce4bf0 100755
--- a/programming_examples/ml/conv2d/Makefile
+++ b/programming_examples/ml/conv2d/Makefile
@@ -34,4 +34,4 @@ clean:
 		chess* *.o insts.txt \
 		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
 run_py: 
-	${powershell} python3 test.py
\ No newline at end of file
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d/README.md b/programming_examples/ml/conv2d/README.md
index 81b25f3e52..b2d93f066d 100644
--- a/programming_examples/ml/conv2d/README.md
+++ b/programming_examples/ml/conv2d/README.md
@@ -56,12 +56,5 @@ make
 
 To run the design:
 ```
-make run
-```
-
-### Prerequisites
-To install the dependencies, run the following command:
-```
-pip install -r requirements.txt
-
+make run_py
 ```
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit
index 1eeef90b94..349e45f9bc 100644
--- a/programming_examples/ml/conv2d/run.lit
+++ b/programming_examples/ml/conv2d/run.lit
@@ -1,4 +1,4 @@
-// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 // REQUIRES: ryzen_ai, chess, torch
@@ -6,5 +6,5 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8  -DINT8_ACT  -c %S/../../../aie_kernels/aie2/conv2dk1_i8.cc -o conv2dk1_i8.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
 // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
-// RUN: %run_on_ipu %python %S/test.py | FileCheck %s
+// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
 // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py
index 1dc847d8fe..1a8d2e7712 100644
--- a/programming_examples/ml/conv2d/test.py
+++ b/programming_examples/ml/conv2d/test.py
@@ -14,136 +14,149 @@
 import os
 import numpy as np
 from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute
+import aie.utils.test as test_utils
 
 torch.use_deterministic_algorithms(True)
 torch.manual_seed(0)
 
-design = "conv2d"
-xclbin_path = os.path.abspath("build/final.xclbin")
-insts_path = os.path.abspath("build/insts.txt")
-
-log_folder = "log/"
-if not os.path.exists(log_folder):
-    os.makedirs(log_folder)
-
-num_iter = 1
-npu_time_total = 0
-npu_time_min = 9999999
-npu_time_max = 0
-trace_size = 16384
-enable_trace = False
-trace_file = "log/trace_" + design + ".txt"
-# ------------------------------------------------------
-# Configure this to match your design's buffer size
-# ------------------------------------------------------
-dtype_in = np.dtype("int8")
-dtype_wts = np.dtype("int8")
-dtype_out = np.dtype("int8")
-
-shape_total_wts = (4096, 1)
-shape_in_act = (32, 8, 32, 8)  #'YCXC8' , 'CYX'
-shape_in_wts1 = (8, 8, 1, 1, 8, 8)  # out,in,ky,kx,in8,out8
-shape_out = (32, 8, 32, 8)
-
-# ------------------------------------------------------
-# Initialize activation, weights, scaling factor for int8 model
-# ------------------------------------------------------
-int_inp = torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor)
-int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor)
-conv_scale = 7.6294e-06  # scale to convert int8 output to floating point
-int8_scale = 0.0078  # scale to convert int8 output to floating point
-min = -128
-max = 127
-# ------------------------------------------------------
-# Get device, load the xclbin & kernel and register them
-# ------------------------------------------------------
-app = setup_aie(
-    xclbin_path,
-    insts_path,
-    shape_in_act,
-    dtype_in,
-    shape_total_wts,
-    dtype_wts,
-    shape_out,
-    dtype_out,
-    enable_trace=enable_trace,
-    trace_size=trace_size,
-)
-
-
-# ------------------------------------------------------
-# Define your golden reference
-# ------------------------------------------------------
-class conv2d_int_model(nn.Module):
-    def __init__(self, in_planes=64, planes=64):
-        super(conv2d_int_model, self).__init__()
-        self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False)
-
-    def forward(self, x):
-        out_int = self.conv(x)
-        out_quant = out_int * conv_scale  # int8 x int8 leads to int32 output
-        out_float = int8_scale * torch.clamp(
-            torch.round(out_quant / int8_scale), min, max
-        )  # converting to int8 range
-        return out_float
-
-
-# ------------------------------------------------------
-# Pytorch baseline
-# ------------------------------------------------------
-model = conv2d_int_model()
-model.eval()
-model.conv.weight.data.copy_(int_weight)
-
-golden_output = model(int_inp)
-
-# ------------------------------------------------------
-# Reorder input data-layout
-# ------------------------------------------------------
-ds = DataShaper()
-before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
-before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
-ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
-ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
-
-wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX")
-total_wts = np.concatenate((wts1), axis=None)
-total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
-
-# ------------------------------------------------------
-# Main run loop
-# ------------------------------------------------------
-for i in range(num_iter):
-    start = time.time_ns()
-    aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale
-    stop = time.time_ns()
-
-    if enable_trace:
-        aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size)
-        write_out_trace(trace, trace_file)
-
-    npu_time = stop - start
-    npu_time_total = npu_time_total + npu_time
-
-# ------------------------------------------------------
-# Reorder output data-layout
-# ------------------------------------------------------
-temp_out = aie_output.reshape(32, 8, 32, 8)
-temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
-ofm_mem_fmt = temp_out.reshape(64, 32, 32)
-ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d")
-ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
-
-# ------------------------------------------------------
-# Compare the AIE output and the golden reference
-# ------------------------------------------------------
-
-print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
-
-assert np.allclose(
-    ofm_mem_fmt_out.detach().numpy(),
-    golden_output.detach().numpy(),
-    rtol=0,
-    atol=2 * int8_scale,
-)
-print("\nPASS!\n")
+
+def main(opts):
+    """Run the int8 1x1 conv2d design on the NPU and check it against PyTorch."""
+    design = "conv2d"
+    xclbin_path = opts.xclbin
+    insts_path = opts.instr
+
+    log_folder = "log/"
+    os.makedirs(log_folder, exist_ok=True)  # the tofile() dumps below land here
+
+    num_iter = 1
+    npu_time_total = 0
+    trace_size = 16384
+    enable_trace = False
+    trace_file = "log/trace_" + design + ".txt"
+    # ------------------------------------------------------
+    # Configure this to match your design's buffer size
+    # ------------------------------------------------------
+    dtype_in = np.dtype("int8")
+    dtype_wts = np.dtype("int8")
+    dtype_out = np.dtype("int8")
+
+    shape_total_wts = (4096, 1)  # element count of shape_in_wts1
+    shape_in_act = (32, 8, 32, 8)  # 'YCXC8', 'CYX'
+    shape_in_wts1 = (8, 8, 1, 1, 8, 8)  # out,in,ky,kx,in8,out8
+    shape_out = (32, 8, 32, 8)
+
+    # ------------------------------------------------------
+    # Initialize activation, weights, scaling factor for int8 model
+    # ------------------------------------------------------
+    int_inp = torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor)
+    int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor)
+    conv_scale = 7.6294e-06  # scale to convert int8 output to floating point
+    int8_scale = 0.0078  # scale to convert int8 output to floating point
+    # int8 saturation bounds; named so the builtins min()/max() are not shadowed.
+    clamp_min = -128
+    clamp_max = 127
+    # ------------------------------------------------------
+    # Get device, load the xclbin & kernel and register them
+    # ------------------------------------------------------
+    app = setup_aie(
+        xclbin_path,
+        insts_path,
+        shape_in_act,
+        dtype_in,
+        shape_total_wts,
+        dtype_wts,
+        shape_out,
+        dtype_out,
+        enable_trace=enable_trace,
+        trace_size=trace_size,
+    )
+
+    # ------------------------------------------------------
+    # Define your golden reference
+    # ------------------------------------------------------
+    class conv2d_int_model(nn.Module):
+        def __init__(self, in_planes=64, planes=64):
+            super(conv2d_int_model, self).__init__()
+            self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False)
+
+        def forward(self, x):
+            out_int = self.conv(x)
+            out_quant = out_int * conv_scale  # int8 x int8 leads to int32 output
+            out_float = int8_scale * torch.clamp(
+                torch.round(out_quant / int8_scale), clamp_min, clamp_max
+            )  # converting to int8 range
+            return out_float
+
+    # ------------------------------------------------------
+    # Pytorch baseline
+    # ------------------------------------------------------
+    model = conv2d_int_model()
+    model.eval()
+    model.conv.weight.data.copy_(int_weight)
+
+    golden_output = model(int_inp)
+
+    # ------------------------------------------------------
+    # Reorder input data-layout
+    # ------------------------------------------------------
+    ds = DataShaper()
+    before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
+    before_input.tofile(
+        log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d"
+    )
+    ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
+    ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
+
+    wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX")
+    total_wts = np.concatenate((wts1,), axis=None)
+    total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
+
+    # ------------------------------------------------------
+    # Main run loop
+    # ------------------------------------------------------
+    for i in range(num_iter):
+        start = time.time_ns()
+        aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale
+        stop = time.time_ns()
+
+        if enable_trace:
+            aie_output, trace = extract_trace(
+                aie_output, shape_out, dtype_out, trace_size
+            )
+            write_out_trace(trace, trace_file)
+
+        npu_time = stop - start
+        npu_time_total = npu_time_total + npu_time
+
+    # ------------------------------------------------------
+    # Reorder output data-layout
+    # ------------------------------------------------------
+    temp_out = aie_output.reshape(32, 8, 32, 8)
+    temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
+    ofm_mem_fmt = temp_out.reshape(64, 32, 32)
+    ofm_mem_fmt.tofile(
+        log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d"
+    )
+    ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
+
+    # ------------------------------------------------------
+    # Compare the AIE output and the golden reference
+    # ------------------------------------------------------
+
+    print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
+
+    if not np.allclose(
+        ofm_mem_fmt_out.detach().numpy(),
+        golden_output.detach().numpy(),
+        rtol=0,
+        atol=2 * int8_scale,  # two steps of the output scale
+    ):  # explicit raise: a bare assert would be stripped by `python -O`
+        raise AssertionError("AIE output does not match the golden reference")
+    print("\nPASS!\n")
+
+
+if __name__ == "__main__":  # run as: test.py -x <xclbin> -i <insts> -k MLIR_AIE
+    p = test_utils.create_default_argparser()
+    opts = p.parse_args(sys.argv[1:])
+    main(opts)
diff --git a/programming_examples/ml/conv2d_fused_relu/Makefile b/programming_examples/ml/conv2d_fused_relu/Makefile
index f804bdd842..7c59ae4877 100755
--- a/programming_examples/ml/conv2d_fused_relu/Makefile
+++ b/programming_examples/ml/conv2d_fused_relu/Makefile
@@ -34,4 +34,4 @@ clean:
 		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
 
 run_py: 
-	${powershell} python3 test.py
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
diff --git a/programming_examples/ml/conv2d_fused_relu/README.md b/programming_examples/ml/conv2d_fused_relu/README.md
index 68e7e9b8cf..3f4a2264cd 100644
--- a/programming_examples/ml/conv2d_fused_relu/README.md
+++ b/programming_examples/ml/conv2d_fused_relu/README.md
@@ -88,12 +88,5 @@ make
 
 To run the design:
 ```
-make run
-```
-
-### Prerequisites
-To install the dependencies, run the following command:
-```
-pip install -r requirements.txt
-
+make run_py
 ```
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d_fused_relu/run.lit b/programming_examples/ml/conv2d_fused_relu/run.lit
index 0c122f451e..cfddde9013 100644
--- a/programming_examples/ml/conv2d_fused_relu/run.lit
+++ b/programming_examples/ml/conv2d_fused_relu/run.lit
@@ -6,5 +6,5 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DINT8_ACT -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
 // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
-// RUN: %run_on_ipu %python %S/test.py | FileCheck %s
+// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
 // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d_fused_relu/test.py b/programming_examples/ml/conv2d_fused_relu/test.py
index 5bfe139112..6fe407faaa 100644
--- a/programming_examples/ml/conv2d_fused_relu/test.py
+++ b/programming_examples/ml/conv2d_fused_relu/test.py
@@ -14,138 +14,151 @@
 import os
 import numpy as np
 from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute
+import aie.utils.test as test_utils
 
 torch.use_deterministic_algorithms(True)
 torch.manual_seed(0)
 
-design = "conv2d_with_relu"
-xclbin_path = os.path.abspath("build/final.xclbin")
-insts_path = os.path.abspath("build/insts.txt")
-
-log_folder = "log/"
-if not os.path.exists(log_folder):
-    os.makedirs(log_folder)
-
-num_iter = 1
-npu_time_total = 0
-npu_time_min = 9999999
-npu_time_max = 0
-trace_size = 16384
-enable_trace = False
-trace_file = "log/trace_" + design + ".txt"
-# ------------------------------------------------------
-# Configure this to match your design's buffer size
-# ------------------------------------------------------
-dtype_in = np.dtype("int8")
-dtype_wts = np.dtype("int8")
-dtype_out = np.dtype("uint8")
-
-shape_total_wts = (4096, 1)
-shape_in_act = (32, 8, 32, 8)  #'YCXC8' , 'CYX'
-shape_in_wts1 = (8, 8, 1, 1, 8, 8)  # out,in,ky,kx,in8,out8
-shape_out = (32, 8, 32, 8)
-
-# ------------------------------------------------------
-# Initialize activation, weights, scaling factor for int8 model
-# ------------------------------------------------------
-int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor)
-int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor)
-conv_scale = 0.0039  # scale to convert int8 output to floating point
-relu_scale = 0.0078  # scale to convert int8 output to floating point
-min = 0
-max = 255
-
-# ------------------------------------------------------
-# Get device, load the xclbin & kernel and register them
-# ------------------------------------------------------
-app = setup_aie(
-    xclbin_path,
-    insts_path,
-    shape_in_act,
-    dtype_in,
-    shape_total_wts,
-    dtype_wts,
-    shape_out,
-    dtype_out,
-    enable_trace=enable_trace,
-    trace_size=trace_size,
-)
-
-
-# ------------------------------------------------------
-# Define your golden reference
-# ------------------------------------------------------
-class conv2d_relu_int_model(nn.Module):
-    def __init__(self, in_planes=64, planes=64):
-        super(conv2d_relu_int_model, self).__init__()
-        self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False)
-        self.relu = nn.ReLU()
-
-    def forward(self, x):
-        out_int = self.conv(x)
-        out_float = out_int * conv_scale
-        out_int = self.relu(out_float)
-        out_float = relu_scale * torch.clamp(
-            torch.round(out_int / relu_scale), min, max
-        )  # converting to int to do proper clipping
-        return out_float
-
-
-# ------------------------------------------------------
-# Pytorch baseline
-# ------------------------------------------------------
-model = conv2d_relu_int_model()
-model.eval()
-model.conv.weight.data.copy_(int_weight)
-golden_output = model(int_inp)
-
-# ------------------------------------------------------
-# Reorder input data-layout
-# ------------------------------------------------------
-ds = DataShaper()
-before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
-before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
-ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
-ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
-
-wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX")
-total_wts = np.concatenate((wts1), axis=None)
-total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
-
-# ------------------------------------------------------
-# Main run loop
-# ------------------------------------------------------
-for i in range(num_iter):
-    start = time.time_ns()
-    aie_output = execute(app, ifm_mem_fmt, total_wts) * relu_scale
-    stop = time.time_ns()
-
-    if enable_trace:
-        aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size)
-        write_out_trace(trace, trace_file)
-
-    npu_time = stop - start
-    npu_time_total = npu_time_total + npu_time
-
-# ------------------------------------------------------
-# Reorder output data-layout
-# ------------------------------------------------------
-temp_out = aie_output.reshape(32, 8, 32, 8)
-temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
-ofm_mem_fmt = temp_out.reshape(64, 32, 32)
-ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d")
-ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
-
-# ------------------------------------------------------
-# Compare the AIE output and the golden reference
-# ------------------------------------------------------
-print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
-
-assert np.allclose(
-    ofm_mem_fmt_out.detach().numpy(),
-    golden_output.detach().numpy(),
-    rtol=0,
-    atol=2 * relu_scale,
-)
-
-print("\nPASS!\n")
+
+def main(opts):
+    """Run the conv2d + ReLU design on the NPU and compare with PyTorch.
+
+    Reads opts.xclbin / opts.instr; asserts the result is within tolerance.
+    """
+    design = "conv2d_with_relu"
+    xclbin_path = opts.xclbin
+    insts_path = opts.instr
+
+    log_folder = "log/"
+    os.makedirs(log_folder, exist_ok=True)  # no check-then-create race
+
+    num_iter = 1
+    npu_time_total = 0
+    trace_size = 16384
+    enable_trace = False
+    trace_file = "log/trace_" + design + ".txt"
+    # ------------------------------------------------------
+    # Configure this to match your design's buffer size
+    # ------------------------------------------------------
+    dtype_in = np.dtype("int8")
+    dtype_wts = np.dtype("int8")
+    dtype_out = np.dtype("uint8")
+
+    shape_total_wts = (4096, 1)  # 8*8*1*1*8*8: out,in,ky,kx,in8,out8
+    shape_in_act = (32, 8, 32, 8)  #'YCXC8' , 'CYX'
+    shape_out = (32, 8, 32, 8)
+
+    # ------------------------------------------------------
+    # Initialize activation, weights, scaling factor for int8 model
+    # ------------------------------------------------------
+    int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor)
+    int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor)
+    conv_scale = 0.0039  # scale to convert int8 output to floating point
+    relu_scale = 0.0078  # scale to convert int8 output to floating point
+    clamp_min = 0  # clip bounds; named so the builtins min/max stay usable
+    clamp_max = 255
+
+    # ------------------------------------------------------
+    # Get device, load the xclbin & kernel and register them
+    # ------------------------------------------------------
+    app = setup_aie(
+        xclbin_path,
+        insts_path,
+        shape_in_act,
+        dtype_in,
+        shape_total_wts,
+        dtype_wts,
+        shape_out,
+        dtype_out,
+        enable_trace=enable_trace,
+        trace_size=trace_size,
+    )
+
+    # ------------------------------------------------------
+    # Define your golden reference
+    # ------------------------------------------------------
+    class conv2d_relu_int_model(nn.Module):
+        def __init__(self, in_planes=64, planes=64):
+            super(conv2d_relu_int_model, self).__init__()
+            self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False)
+            self.relu = nn.ReLU()
+
+        def forward(self, x):
+            out_int = self.conv(x)
+            out_float = out_int * conv_scale
+            out_int = self.relu(out_float)
+            out_float = relu_scale * torch.clamp(
+                torch.round(out_int / relu_scale), clamp_min, clamp_max
+            )  # converting to int to do proper clipping
+            return out_float
+
+    # ------------------------------------------------------
+    # Pytorch baseline
+    # ------------------------------------------------------
+    model = conv2d_relu_int_model()
+    model.eval()
+    model.conv.weight.data.copy_(int_weight)
+    golden_output = model(int_inp)
+
+    # ------------------------------------------------------
+    # Reorder input data-layout
+    # ------------------------------------------------------
+    ds = DataShaper()
+    before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
+    before_input.tofile(
+        log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d"
+    )
+    ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
+    ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
+
+    wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX")
+    total_wts = np.concatenate((wts1), axis=None)
+    total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
+
+    # ------------------------------------------------------
+    # Main run loop
+    # ------------------------------------------------------
+    for _ in range(num_iter):
+        start = time.time_ns()
+        raw_output = execute(app, ifm_mem_fmt, total_wts)
+        stop = time.time_ns()
+
+        if enable_trace:
+            # NOTE(review): assumes extract_trace wants the raw dtype_out buffer
+            raw_output, trace = extract_trace(
+                raw_output, shape_out, dtype_out, trace_size
+            )
+            write_out_trace(trace, trace_file)
+        aie_output = raw_output * relu_scale  # scale the data, never the trace
+        npu_time_total += stop - start
+
+    # ------------------------------------------------------
+    # Reorder output data-layout
+    # ------------------------------------------------------
+    temp_out = aie_output.reshape(32, 8, 32, 8)
+    temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
+    ofm_mem_fmt = temp_out.reshape(64, 32, 32)
+    ofm_mem_fmt.tofile(
+        log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d"
+    )
+    ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
+
+    # ------------------------------------------------------
+    # Compare the AIE output and the golden reference
+    # ------------------------------------------------------
+    print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
+
+    assert np.allclose(
+        ofm_mem_fmt_out.detach().numpy(),
+        golden_output.detach().numpy(),
+        rtol=0,
+        atol=2 * relu_scale,
+    )
+
+    print("\nPASS!\n")
+
+
+if __name__ == "__main__":
+    # Build the default argument parser, parse the CLI arguments, then run.
+    opts = test_utils.create_default_argparser().parse_args(sys.argv[1:])
+    main(opts)
diff --git a/programming_examples/ml/resnet/README.md b/programming_examples/ml/resnet/README.md
index 6382079c62..de4cc92535 100755
--- a/programming_examples/ml/resnet/README.md
+++ b/programming_examples/ml/resnet/README.md
@@ -107,14 +107,6 @@ To run the design:
 make run_py
 ```
 
-### Prerequisites
-
-To install the dependencies, run the following command:
-```
-pip install -r requirements.txt
-
-```
-
 ## References
 <a id="1">[1]</a> 
 He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770-778).
diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile
index d8f1b7261a..6218e61fb5 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/Makefile
+++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile
@@ -44,4 +44,4 @@ clean:
 		*.log aie_partition.json *.bin BOOT.BIN _x test.exe
 
 run_py: 
-	${powershell} python3 test.py
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
\ No newline at end of file
diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
index 235b5c5308..f5243070d9 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py
+++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
@@ -7,8 +7,8 @@
 
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
+from aie.dialects.scf import *
 from aie.extras.dialects.ext import memref, arith
-from aie.dialects.scf import for_, yield_
 from aie.extras.context import mlir_mod_ctx
 from aie.ir import MemRefType, TypeAttr
 
diff --git a/programming_examples/ml/resnet/layers_conv2_x/run.lit b/programming_examples/ml/resnet/layers_conv2_x/run.lit
index 61f43e45e6..c35a868772 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/run.lit
+++ b/programming_examples/ml/resnet/layers_conv2_x/run.lit
@@ -10,5 +10,5 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
 // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
-// RUN: %run_on_ipu %python %S/test.py | FileCheck %s
+// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
 // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/ml/resnet/layers_conv2_x/test.py b/programming_examples/ml/resnet/layers_conv2_x/test.py
index 02dc01b127..48b45b99ae 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/test.py
+++ b/programming_examples/ml/resnet/layers_conv2_x/test.py
@@ -14,423 +14,473 @@
 import os
 import numpy as np
 from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute
+import aie.utils.test as test_utils
 
 torch.use_deterministic_algorithms(True)
 torch.manual_seed(0)
 
-design = "resnet_conv2_x_int8"
-xclbin_path = os.path.abspath("build/final.xclbin")
-insts_path = os.path.abspath("build/insts.txt")
-
-log_folder = "log/"
-if not os.path.exists(log_folder):
-    os.makedirs(log_folder)
-
-num_iter = 1
-npu_time_total = 0
-npu_time_min = 9999999
-npu_time_max = 0
-trace_size = 16384
-enable_trace = False
-trace_file = "log/trace_" + design + ".txt"
-# ------------------------------------------------------
-# Configure this to match your design's buffer size
-# ------------------------------------------------------
-dtype_in = np.dtype("int8")
-dtype_wts = np.dtype("int8")
-dtype_out = np.dtype("uint8")
-
-shape_in_act = (32, 8, 32, 8)
-shape_total_wts = (212992, 1)
-shape_out = (32, 32, 32, 8)
-
-# ------------------------------------------------------
-# Initialize activation, weights, scaling factor for int8 model
-# ------------------------------------------------------
-int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor)
-block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor)
-block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor)
-block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor)
-block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor)
-
-block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor)
-block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor)
-block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor)
-
-block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor)
-block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor)
-block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor)
-
-init_scale = 0.5
-block_0_relu_1 = 0.5
-block_0_relu_2 = 0.5
-block_0_relu_3 = 0.5
-
-block_0_weight_scale1 = 0.5
-block_0_weight_scale2 = 0.5
-block_0_weight_scale3 = 0.5
-block_0_weight_scale_skip = 0.5
-
-block_1_relu_1 = 0.5
-block_1_relu_2 = 0.5
-block_1_relu_3 = 0.5
-
-block_1_weight_scale1 = 0.5
-block_1_weight_scale2 = 0.5
-block_1_weight_scale3 = 0.5
-block_1_quant_add_1 = 0.5
-
-block_2_relu_1 = 0.5
-block_2_relu_2 = 0.5
-block_2_relu_3 = 0.5
-
-block_2_weight_scale1 = 0.5
-block_2_weight_scale2 = 0.5
-block_2_weight_scale3 = 0.5
-block_2_quant_add_1 = 0.5
-
-block_0_combined_scale1 = -math.log2(
-    init_scale * block_0_weight_scale1 / block_0_relu_1
-)  # RHS after first conv1x1 | clip 0-->255
-block_0_combined_scale2 = -math.log2(
-    block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2
-)  # RHS after second conv3x3 | clip 0-->255
-block_0_combined_scale3 = -math.log2(
-    block_0_relu_2 * block_0_weight_scale3 / init_scale
-)  # RHS after third conv1x1 | clip -128-->+127
-block_0_combined_scale_skip = -math.log2(
-    init_scale * block_0_weight_scale_skip / init_scale
-)  # LHS after conv1x1 | clip -128-->+127
-block_0_combined_scale4 = -math.log2(
-    init_scale / block_0_relu_3
-)  # After addition | clip 0-->255
-
-block_1_combined_scale1 = -math.log2(
-    block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1
-)  # RHS after first conv1x1 | clip 0-->255
-block_1_combined_scale2 = -math.log2(
-    block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2
-)  # RHS after second conv3x3 | clip 0-->255
-block_1_combined_scale3 = -math.log2(
-    block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1
-)  # RHS after third conv1x1 | clip -128-->+127
-block_1_combined_scale4 = -math.log2(
-    block_1_quant_add_1 / block_1_relu_3
-)  # After addition | clip 0-->255
-
-block_2_combined_scale1 = -math.log2(
-    block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1
-)  # RHS after first conv1x1 | clip 0-->255
-block_2_combined_scale2 = -math.log2(
-    block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2
-)  # RHS after second conv3x3 | clip 0-->255
-block_2_combined_scale3 = -math.log2(
-    block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1
-)  # RHS after third conv1x1 | clip -128-->+127
-block_2_combined_scale4 = -math.log2(
-    block_2_quant_add_1 / block_2_relu_3
-)  # After addition | clip 0-->255
-
-min = 0
-max = 255
-
-# ------------------------------------------------------
-# Get device, load the xclbin & kernel and register them
-# ------------------------------------------------------
-app = setup_aie(
-    xclbin_path,
-    insts_path,
-    shape_in_act,
-    dtype_in,
-    shape_total_wts,
-    dtype_wts,
-    shape_out,
-    dtype_out,
-    enable_trace=enable_trace,
-    trace_size=trace_size,
-)
-
-
-# ------------------------------------------------------
-# Define your golden reference
-# ------------------------------------------------------
-class resnet_conv2_x_int8(nn.Module):
-    expansion = 4
-
-    def __init__(self, in_planes=64, planes=64):
-        super(resnet_conv2_x_int8, self).__init__()
-
-        self.shortcut = nn.Conv2d(
-            in_planes, self.expansion * planes, kernel_size=1, bias=False
-        )
-        # Bottleneck 0
-        self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
-        self.block_0_conv2 = nn.Conv2d(
-            planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False
-        )
-        self.block_0_conv3 = nn.Conv2d(
-            planes, self.expansion * planes, kernel_size=1, bias=False
-        )
-
-        self.block_0_relu1 = nn.ReLU()
-        self.block_0_relu2 = nn.ReLU()
-        self.block_0_relu3 = nn.ReLU()
-
-        # Bottleneck 1
-        self.block_1_conv1 = nn.Conv2d(
-            self.expansion * planes, planes, kernel_size=1, bias=False
-        )
-        self.block_1_conv2 = nn.Conv2d(
-            planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False
-        )
-        self.block_1_conv3 = nn.Conv2d(
-            planes, self.expansion * planes, kernel_size=1, bias=False
-        )
-
-        self.block_1_relu1 = nn.ReLU()
-        self.block_1_relu2 = nn.ReLU()
-        self.block_1_relu3 = nn.ReLU()
-
-        # Bottleneck 2
-        self.block_2_conv1 = nn.Conv2d(
-            self.expansion * planes, planes, kernel_size=1, bias=False
-        )
-        self.block_2_conv2 = nn.Conv2d(
-            planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False
-        )
-        self.block_2_conv3 = nn.Conv2d(
-            planes, self.expansion * planes, kernel_size=1, bias=False
-        )
-
-        self.block_2_relu1 = nn.ReLU()
-        self.block_2_relu2 = nn.ReLU()
-        self.block_2_relu3 = nn.ReLU()
-
-    def forward(self, x):
-        # **************** Bottleneck 0 ****************
-        block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1
-        block_0_relu1_out = torch.clamp(
-            torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1),
-            min,
-            max,
-        )  # convert to int and apply relu
-        block_0_conv2_out = (
-            self.block_0_conv2(block_0_relu1_out)
-            * block_0_relu_1
-            * block_0_weight_scale2
-        )
-        block_0_relu2_out = torch.clamp(
-            torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2),
-            min,
-            max,
-        )
-        block_0_conv3_out = (
-            self.block_0_conv3(block_0_relu2_out)
-            * block_0_relu_2
-            * block_0_weight_scale3
-        )
-        block_0_rhf_same_scale = torch.clamp(
-            torch.round(block_0_conv3_out / init_scale), -128, 127
-        )
-
-        block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip
-        block_0_lhs_same_scale = torch.clamp(
-            torch.round(block_0_lhs_conv / init_scale), -128, 127
-        )
-        # convert to int and apply relu
-
-        block_0_skip_add = init_scale * (
-            block_0_rhf_same_scale + block_0_lhs_same_scale
-        )
-        block_0_final_out = torch.clamp(
-            torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max
-        )
-        # **************** Bottleneck 1 ****************
-        block_1_conv1_out = (
-            self.block_1_conv1(block_0_final_out)
-            * block_0_relu_3
-            * block_1_weight_scale1
-        )
-        block_1_relu1_out = torch.clamp(
-            torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1),
-            min,
-            max,
-        )  # convert to int and apply relu
-        block_1_conv2_out = (
-            self.block_1_conv2(block_1_relu1_out)
-            * block_1_relu_1
-            * block_1_weight_scale2
-        )
-        block_1_relu2_out = torch.clamp(
-            torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2),
-            min,
-            max,
-        )
-        block_1_conv3_out = (
-            self.block_1_conv3(block_1_relu2_out)
-            * block_1_relu_2
-            * block_1_weight_scale3
-        )
-        block_1_rhf_same_scale = torch.clamp(
-            torch.round(block_1_conv3_out / block_0_relu_3), -128, 127
-        )
-
-        block_1_skip_add = block_0_relu_3 * (block_1_rhf_same_scale + block_0_final_out)
-        block_1_final_out = torch.clamp(
-            torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max
-        )
-
-        # **************** Bottleneck 2 ****************
-        block_2_conv1_out = (
-            self.block_2_conv1(block_1_final_out)
-            * block_1_relu_3
-            * block_2_weight_scale1
-        )
-        block_2_relu1_out = torch.clamp(
-            torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1),
-            min,
-            max,
-        )  # convert to int and apply relu
-        block_2_conv2_out = (
-            self.block_2_conv2(block_2_relu1_out)
-            * block_2_relu_1
-            * block_2_weight_scale2
-        )
-        block_2_relu2_out = torch.clamp(
-            torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2),
-            min,
-            max,
-        )
-        block_2_conv3_out = (
-            self.block_2_conv3(block_2_relu2_out)
-            * block_2_relu_2
-            * block_2_weight_scale3
-        )
-        block_2_rhf_same_scale = torch.clamp(
-            torch.round(block_2_conv3_out / block_1_relu_3), -128, 127
-        )
-
-        block_2_skip_add = block_1_relu_3 * (block_2_rhf_same_scale + block_1_final_out)
-        block_2_final_out = block_2_relu_3 * (
-            torch.clamp(
-                torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3),
+
+def main(opts):
+    design = "resnet_conv2_x_int8"
+    xclbin_path = opts.xclbin
+    insts_path = opts.instr
+
+    log_folder = "log/"
+    if not os.path.exists(log_folder):
+        os.makedirs(log_folder)
+
+    num_iter = 1
+    npu_time_total = 0
+    npu_time_min = 9999999
+    npu_time_max = 0
+    trace_size = 16384
+    enable_trace = False
+    trace_file = "log/trace_" + design + ".txt"
+    # ------------------------------------------------------
+    # Configure this to match your design's buffer size
+    # ------------------------------------------------------
+    dtype_in = np.dtype("int8")
+    dtype_wts = np.dtype("int8")
+    dtype_out = np.dtype("uint8")
+
+    shape_in_act = (32, 8, 32, 8)
+    shape_total_wts = (212992, 1)
+    shape_out = (32, 32, 32, 8)
+
+    # ------------------------------------------------------
+    # Initialize activation, weights, scaling factor for int8 model
+    # ------------------------------------------------------
+    int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor)
+    block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor)
+    block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor)
+    block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(
+        torch.FloatTensor
+    )
+    block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(
+        torch.FloatTensor
+    )
+
+    block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(
+        torch.FloatTensor
+    )
+    block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor)
+    block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(
+        torch.FloatTensor
+    )
+
+    block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(
+        torch.FloatTensor
+    )
+    block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor)
+    block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(
+        torch.FloatTensor
+    )
+
+    init_scale = 0.5
+    block_0_relu_1 = 0.5
+    block_0_relu_2 = 0.5
+    block_0_relu_3 = 0.5
+
+    block_0_weight_scale1 = 0.5
+    block_0_weight_scale2 = 0.5
+    block_0_weight_scale3 = 0.5
+    block_0_weight_scale_skip = 0.5
+
+    block_1_relu_1 = 0.5
+    block_1_relu_2 = 0.5
+    block_1_relu_3 = 0.5
+
+    block_1_weight_scale1 = 0.5
+    block_1_weight_scale2 = 0.5
+    block_1_weight_scale3 = 0.5
+    block_1_quant_add_1 = 0.5
+
+    block_2_relu_1 = 0.5
+    block_2_relu_2 = 0.5
+    block_2_relu_3 = 0.5
+
+    block_2_weight_scale1 = 0.5
+    block_2_weight_scale2 = 0.5
+    block_2_weight_scale3 = 0.5
+    block_2_quant_add_1 = 0.5
+
+    block_0_combined_scale1 = -math.log2(
+        init_scale * block_0_weight_scale1 / block_0_relu_1
+    )  # RHS after first conv1x1 | clip 0-->255
+    block_0_combined_scale2 = -math.log2(
+        block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2
+    )  # RHS after second conv3x3 | clip 0-->255
+    block_0_combined_scale3 = -math.log2(
+        block_0_relu_2 * block_0_weight_scale3 / init_scale
+    )  # RHS after third conv1x1 | clip -128-->+127
+    block_0_combined_scale_skip = -math.log2(
+        init_scale * block_0_weight_scale_skip / init_scale
+    )  # LHS after conv1x1 | clip -128-->+127
+    block_0_combined_scale4 = -math.log2(
+        init_scale / block_0_relu_3
+    )  # After addition | clip 0-->255
+
+    block_1_combined_scale1 = -math.log2(
+        block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1
+    )  # RHS after first conv1x1 | clip 0-->255
+    block_1_combined_scale2 = -math.log2(
+        block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2
+    )  # RHS after second conv3x3 | clip 0-->255
+    block_1_combined_scale3 = -math.log2(
+        block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1
+    )  # RHS after third conv1x1 | clip -128-->+127
+    block_1_combined_scale4 = -math.log2(
+        block_1_quant_add_1 / block_1_relu_3
+    )  # After addition | clip 0-->255
+
+    block_2_combined_scale1 = -math.log2(
+        block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1
+    )  # RHS after first conv1x1 | clip 0-->255
+    block_2_combined_scale2 = -math.log2(
+        block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2
+    )  # RHS after second conv3x3 | clip 0-->255
+    block_2_combined_scale3 = -math.log2(
+        block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1
+    )  # RHS after third conv1x1 | clip -128-->+127
+    block_2_combined_scale4 = -math.log2(
+        block_2_quant_add_1 / block_2_relu_3
+    )  # After addition | clip 0-->255
+
+    min = 0
+    max = 255
+
+    # ------------------------------------------------------
+    # Get device, load the xclbin & kernel and register them
+    # ------------------------------------------------------
+    app = setup_aie(
+        xclbin_path,
+        insts_path,
+        shape_in_act,
+        dtype_in,
+        shape_total_wts,
+        dtype_wts,
+        shape_out,
+        dtype_out,
+        enable_trace=enable_trace,
+        trace_size=trace_size,
+    )
+
+    # ------------------------------------------------------
+    # Define your golden reference
+    # ------------------------------------------------------
+    class resnet_conv2_x_int8(nn.Module):
+        expansion = 4
+
+        def __init__(self, in_planes=64, planes=64):
+            super(resnet_conv2_x_int8, self).__init__()
+
+            self.shortcut = nn.Conv2d(
+                in_planes, self.expansion * planes, kernel_size=1, bias=False
+            )
+            # Bottleneck 0
+            self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
+            self.block_0_conv2 = nn.Conv2d(
+                planes,
+                planes,
+                kernel_size=3,
+                padding=1,
+                padding_mode="zeros",
+                bias=False,
+            )
+            self.block_0_conv3 = nn.Conv2d(
+                planes, self.expansion * planes, kernel_size=1, bias=False
+            )
+
+            self.block_0_relu1 = nn.ReLU()
+            self.block_0_relu2 = nn.ReLU()
+            self.block_0_relu3 = nn.ReLU()
+
+            # Bottleneck 1
+            self.block_1_conv1 = nn.Conv2d(
+                self.expansion * planes, planes, kernel_size=1, bias=False
+            )
+            self.block_1_conv2 = nn.Conv2d(
+                planes,
+                planes,
+                kernel_size=3,
+                padding=1,
+                padding_mode="zeros",
+                bias=False,
+            )
+            self.block_1_conv3 = nn.Conv2d(
+                planes, self.expansion * planes, kernel_size=1, bias=False
+            )
+
+            self.block_1_relu1 = nn.ReLU()
+            self.block_1_relu2 = nn.ReLU()
+            self.block_1_relu3 = nn.ReLU()
+
+            # Bottleneck 2
+            self.block_2_conv1 = nn.Conv2d(
+                self.expansion * planes, planes, kernel_size=1, bias=False
+            )
+            self.block_2_conv2 = nn.Conv2d(
+                planes,
+                planes,
+                kernel_size=3,
+                padding=1,
+                padding_mode="zeros",
+                bias=False,
+            )
+            self.block_2_conv3 = nn.Conv2d(
+                planes, self.expansion * planes, kernel_size=1, bias=False
+            )
+
+            self.block_2_relu1 = nn.ReLU()
+            self.block_2_relu2 = nn.ReLU()
+            self.block_2_relu3 = nn.ReLU()
+
+        def forward(self, x):
+            # **************** Bottleneck 0 ****************
+            block_0_conv1_out = (
+                self.block_0_conv1(x) * init_scale * block_0_weight_scale1
+            )
+            block_0_relu1_out = torch.clamp(
+                torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1),
+                min,
+                max,
+            )  # convert to int and apply relu
+            block_0_conv2_out = (
+                self.block_0_conv2(block_0_relu1_out)
+                * block_0_relu_1
+                * block_0_weight_scale2
+            )
+            block_0_relu2_out = torch.clamp(
+                torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2),
+                min,
+                max,
+            )
+            block_0_conv3_out = (
+                self.block_0_conv3(block_0_relu2_out)
+                * block_0_relu_2
+                * block_0_weight_scale3
+            )
+            block_0_rhf_same_scale = torch.clamp(
+                torch.round(block_0_conv3_out / init_scale), -128, 127
+            )
+
+            block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip
+            block_0_lhs_same_scale = torch.clamp(
+                torch.round(block_0_lhs_conv / init_scale), -128, 127
+            )
+            # convert to int and apply relu
+
+            block_0_skip_add = init_scale * (
+                block_0_rhf_same_scale + block_0_lhs_same_scale
+            )
+            block_0_final_out = torch.clamp(
+                torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3),
                 min,
                 max,
             )
-        )
-        return block_2_final_out
-
-
-# ------------------------------------------------------
-# Pytorch baseline
-# ------------------------------------------------------
-model = resnet_conv2_x_int8()
-model.eval()
-model.block_0_conv1.weight.data.copy_(block_0_int_weight_1)
-model.block_0_conv2.weight.data.copy_(block_0_int_weight_2)
-model.block_0_conv3.weight.data.copy_(block_0_int_weight_3)
-model.shortcut.weight.data.copy_(block_0_int_weight_skip)
-
-model.block_1_conv1.weight.data.copy_(block_1_int_weight_1)
-model.block_1_conv2.weight.data.copy_(block_1_int_weight_2)
-model.block_1_conv3.weight.data.copy_(block_1_int_weight_3)
-
-model.block_2_conv1.weight.data.copy_(block_2_int_weight_1)
-model.block_2_conv2.weight.data.copy_(block_2_int_weight_2)
-model.block_2_conv3.weight.data.copy_(block_2_int_weight_3)
-
-golden_output = model(int_inp)
-
-# ------------------------------------------------------
-# Reorder input data-layout
-# ------------------------------------------------------
-ds = DataShaper()
-before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
-before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
-ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
-ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
-
-block0_wts1 = ds.reorder_mat(
-    block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-block0_wts2 = ds.reorder_mat(
-    block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-block0_wts3 = ds.reorder_mat(
-    block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-block0_wts_skip = ds.reorder_mat(
-    block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-
-total_wts = np.concatenate(
-    (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None
-)
-
-block1_wts1 = ds.reorder_mat(
-    block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-block1_wts2 = ds.reorder_mat(
-    block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-block1_wts3 = ds.reorder_mat(
-    block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-
-total_wts2 = np.concatenate(
-    (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None
-)
-
-block2_wts1 = ds.reorder_mat(
-    block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-block2_wts2 = ds.reorder_mat(
-    block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-block2_wts3 = ds.reorder_mat(
-    block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
-)
-
-total_wts3 = np.concatenate(
-    (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None
-)
-
-total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
-
-# ------------------------------------------------------
-# Main run loop
-# ------------------------------------------------------
-for i in range(num_iter):
-    start = time.time_ns()
-    aie_output = execute(app, ifm_mem_fmt, total_wts) * block_2_relu_3
-    stop = time.time_ns()
-
-    if enable_trace:
-        aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size)
-        write_out_trace(trace, trace_file)
-
-    npu_time = stop - start
-    npu_time_total = npu_time_total + npu_time
-
-# ------------------------------------------------------
-# Reorder output data-layout
-# ------------------------------------------------------
-temp_out = aie_output.reshape(32, 32, 32, 8)
-temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
-ofm_mem_fmt = temp_out.reshape(256, 32, 32)
-ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d")
-ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
-
-# ------------------------------------------------------
-# Compare the AIE output and the golden reference
-# ------------------------------------------------------
-print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
-
-assert np.allclose(
-    ofm_mem_fmt_out.detach().numpy(),
-    golden_output.detach().numpy(),
-    rtol=0,
-    atol=block_2_relu_3,
-)
-
-print("\nPASS!\n")
+            # **************** Bottleneck 1 ****************
+            block_1_conv1_out = (
+                self.block_1_conv1(block_0_final_out)
+                * block_0_relu_3
+                * block_1_weight_scale1
+            )
+            block_1_relu1_out = torch.clamp(
+                torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1),
+                min,
+                max,
+            )  # convert to int and apply relu
+            block_1_conv2_out = (
+                self.block_1_conv2(block_1_relu1_out)
+                * block_1_relu_1
+                * block_1_weight_scale2
+            )
+            block_1_relu2_out = torch.clamp(
+                torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2),
+                min,
+                max,
+            )
+            block_1_conv3_out = (
+                self.block_1_conv3(block_1_relu2_out)
+                * block_1_relu_2
+                * block_1_weight_scale3
+            )
+            block_1_rhf_same_scale = torch.clamp(
+                torch.round(block_1_conv3_out / block_0_relu_3), -128, 127
+            )
+
+            block_1_skip_add = block_0_relu_3 * (
+                block_1_rhf_same_scale + block_0_final_out
+            )
+            block_1_final_out = torch.clamp(
+                torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3),
+                min,
+                max,
+            )
+
+            # **************** Bottleneck 2 ****************
+            block_2_conv1_out = (
+                self.block_2_conv1(block_1_final_out)
+                * block_1_relu_3
+                * block_2_weight_scale1
+            )
+            block_2_relu1_out = torch.clamp(
+                torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1),
+                min,
+                max,
+            )  # convert to int and apply relu
+            block_2_conv2_out = (
+                self.block_2_conv2(block_2_relu1_out)
+                * block_2_relu_1
+                * block_2_weight_scale2
+            )
+            block_2_relu2_out = torch.clamp(
+                torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2),
+                min,
+                max,
+            )
+            block_2_conv3_out = (
+                self.block_2_conv3(block_2_relu2_out)
+                * block_2_relu_2
+                * block_2_weight_scale3
+            )
+            block_2_rhf_same_scale = torch.clamp(
+                torch.round(block_2_conv3_out / block_1_relu_3), -128, 127
+            )
+
+            block_2_skip_add = block_1_relu_3 * (
+                block_2_rhf_same_scale + block_1_final_out
+            )
+            block_2_final_out = block_2_relu_3 * (
+                torch.clamp(
+                    torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3),
+                    min,
+                    max,
+                )
+            )
+            return block_2_final_out
+
+    # ------------------------------------------------------
+    # Pytorch baseline
+    # ------------------------------------------------------
+    model = resnet_conv2_x_int8()
+    model.eval()
+    model.block_0_conv1.weight.data.copy_(block_0_int_weight_1)
+    model.block_0_conv2.weight.data.copy_(block_0_int_weight_2)
+    model.block_0_conv3.weight.data.copy_(block_0_int_weight_3)
+    model.shortcut.weight.data.copy_(block_0_int_weight_skip)
+
+    model.block_1_conv1.weight.data.copy_(block_1_int_weight_1)
+    model.block_1_conv2.weight.data.copy_(block_1_int_weight_2)
+    model.block_1_conv3.weight.data.copy_(block_1_int_weight_3)
+
+    model.block_2_conv1.weight.data.copy_(block_2_int_weight_1)
+    model.block_2_conv2.weight.data.copy_(block_2_int_weight_2)
+    model.block_2_conv3.weight.data.copy_(block_2_int_weight_3)
+
+    golden_output = model(int_inp)
+
+    # ------------------------------------------------------
+    # Reorder input data-layout
+    # ------------------------------------------------------
+    ds = DataShaper()
+    before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
+    before_input.tofile(
+        log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d"
+    )
+    ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX")
+    ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d")
+
+    block0_wts1 = ds.reorder_mat(
+        block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+    block0_wts2 = ds.reorder_mat(
+        block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+    block0_wts3 = ds.reorder_mat(
+        block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+    block0_wts_skip = ds.reorder_mat(
+        block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+
+    total_wts = np.concatenate(
+        (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None
+    )
+
+    block1_wts1 = ds.reorder_mat(
+        block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+    block1_wts2 = ds.reorder_mat(
+        block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+    block1_wts3 = ds.reorder_mat(
+        block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+
+    total_wts2 = np.concatenate(
+        (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None
+    )
+
+    block2_wts1 = ds.reorder_mat(
+        block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+    block2_wts2 = ds.reorder_mat(
+        block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+    block2_wts3 = ds.reorder_mat(
+        block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX"
+    )
+
+    total_wts3 = np.concatenate(
+        (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None
+    )
+
+    total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d")
+
+    # ------------------------------------------------------
+    # Main run loop
+    # ------------------------------------------------------
+    for i in range(num_iter):
+        start = time.time_ns()
+        aie_output = execute(app, ifm_mem_fmt, total_wts) * block_2_relu_3
+        stop = time.time_ns()
+
+        if enable_trace:
+            aie_output, trace = extract_trace(
+                aie_output, shape_out, dtype_out, trace_size
+            )
+            write_out_trace(trace, trace_file)
+
+        npu_time = stop - start
+        npu_time_total = npu_time_total + npu_time
+
+    # ------------------------------------------------------
+    # Reorder output data-layout
+    # ------------------------------------------------------
+    temp_out = aie_output.reshape(32, 32, 32, 8)
+    temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD")
+    ofm_mem_fmt = temp_out.reshape(256, 32, 32)
+    ofm_mem_fmt.tofile(
+        log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d"
+    )
+    ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
+
+    # ------------------------------------------------------
+    # Compare the AIE output and the golden reference
+    # ------------------------------------------------------
+    print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))
+
+    assert np.allclose(
+        ofm_mem_fmt_out.detach().numpy(),
+        golden_output.detach().numpy(),
+        rtol=0,
+        atol=block_2_relu_3,
+    )
+
+    print("\nPASS!\n")
+
+
+if __name__ == "__main__":
+    p = test_utils.create_default_argparser()
+    opts = p.parse_args(sys.argv[1:])
+    main(opts)
diff --git a/programming_guide/section-6/README.md b/programming_guide/section-6/README.md
index 83e8899002..f54c812ab3 100644
--- a/programming_guide/section-6/README.md
+++ b/programming_guide/section-6/README.md
@@ -26,8 +26,14 @@ There are a number of example designs available [here](../../programming_example
 
 | Design name | Data type | Description | 
 |-|-|-|
-|[bottleneck](../../programming_examples/ml/bottleneck/)|ui8|A Bottleneck Residual Block is a variant of the residual block that utilises 1x1 convolutions to create a bottleneck. The use of a bottleneck reduces the number of parameters and computations.|
+|[bottleneck](../../programming_examples/ml/bottleneck/)|ui8|A Bottleneck Residual Block is a variant of the residual block that utilises three convolutions, using 1x1, 3x3 and 1x1 filter sizes, respectively. The use of a bottleneck reduces the number of parameters and computations.|
 |[resnet](../../programming_examples/ml/resnet/)|ui8|ResNet with offloaded conv2_x bottleneck blocks. The implementation features kernel fusion and dataflow optimizations highlighting the unique architectural capabilties of AI Engines.|
 
+## Exercises
+
+1. In the [bottleneck](../../programming_examples/ml/bottleneck/) design, which follows a dataflow approach, how many elements does the 3x3 convolution operation require to proceed with its computation? <img src="../../mlir_tutorials/images/answer1.jpg" title="3. This allows for the necessary neighborhood information required by the convolutional kernel to be available for processing." height=25>
+2. Suppose you have a bottleneck block with input dimensions of 32x32x256. After passing through the 1x1 convolutional layer, the output dimensions become 32x32x64. What would be the output dimensions after the subsequent 3x3 convolutional layer, assuming a stride of 1, no padding, and 64 output channels? <img src="../../mlir_tutorials/images/answer1.jpg" title="30×30×64. Without padding, the spatial dimensions would shrink by two pixels in each dimension due to the 3x3 convolution operation." height=25>
+
 -----
 [[Prev - Section 5](../section-5/)] [[Top](..)]
+