diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index a57dff389c..c8486817a0 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -100,4 +100,4 @@ clean: clean_trace
 
 .PHONY: clean_trace
 clean_trace:
-	rm -rf tmpTrace parse*.json 
+	rm -rf tmpTrace parse*.json  trace.txt
diff --git a/programming_guide/assets/trace_vector_scalar_mul1.png b/programming_guide/assets/trace_vector_scalar_mul1.png
new file mode 100755
index 0000000000..0e63467715
Binary files /dev/null and b/programming_guide/assets/trace_vector_scalar_mul1.png differ
diff --git a/programming_guide/quick_reference.md b/programming_guide/quick_reference.md
index 4e0c5d11b7..fff7e14cbd 100644
--- a/programming_guide/quick_reference.md
+++ b/programming_guide/quick_reference.md
@@ -49,6 +49,12 @@
 | `print(ctx.module)` | Converts our ctx wrapped structural code to mlir and prints to stdout|
 | `ctx.module.operation.verify()` | Runs additional structural verficiation on the python binded source code and return result to stdout |
 
+## Common AIE API functions for Kernel Programming
+| Function Signature  | Definition | Parameters | Return Type | Example | 
+|---------------------|------------|------------|-------------|---------|
+| `aie::vector<T, vec_factor> my_vector` | Declare a vector variable | `T`: element data type <br> `vec_factor`: vector width (number of elements) | n/a | `aie::vector<int16_t, 32> my_vector;` |
+| `aie::load_v<vec_factor>(pA1);` | Vector load | `vec_factor`: vector width (number of elements) <br> `pA1`: pointer to the data to load | `aie::vector` | `aie::vector<int16_t, 32> my_vector = aie::load_v<32>(pA1);` |
+
 ## Helpful AI Engine Architecture References and Tables
 * [AIE2 - Table of supported data types and vector sizes (AIE API)](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__basic__types.html)
 
diff --git a/programming_guide/section-1/Makefile b/programming_guide/section-1/Makefile
index 9a89112879..1a3d65de9a 100644
--- a/programming_guide/section-1/Makefile
+++ b/programming_guide/section-1/Makefile
@@ -6,7 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../../tutorials/makefile-common
+include ../../programming_examples/makefile-common
 
 build/aie.mlir: aie2.py
 	mkdir -p ${@D}
diff --git a/programming_guide/section-3/Makefile b/programming_guide/section-3/Makefile
index eb57eeb40b..77688005e3 100644
--- a/programming_guide/section-3/Makefile
+++ b/programming_guide/section-3/Makefile
@@ -12,11 +12,15 @@ all: build/final.xclbin build/insts.txt
 
 targetname = vectorScalar
 
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+	
 build/scale.o: vector_scalar_mul.cc
 	mkdir -p ${@D}
 	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
 
-build/final.xclbin: aie.mlir build/kernel1.o build/kernel2.o build/kernel3.o
+build/final.xclbin: build/aie.mlir build/scale.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
 				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
diff --git a/programming_guide/section-3/README.md b/programming_guide/section-3/README.md
index c91095d68f..166cab665e 100644
--- a/programming_guide/section-3/README.md
+++ b/programming_guide/section-3/README.md
@@ -149,7 +149,6 @@ To compile the design and C++ testbench:
 
 ```sh
 make
-make build/vectorScalar.exe
 ```
 
 To run the design:
diff --git a/programming_guide/section-3/test.cpp b/programming_guide/section-3/test.cpp
index c5690e127d..0698905f19 100644
--- a/programming_guide/section-3/test.cpp
+++ b/programming_guide/section-3/test.cpp
@@ -34,13 +34,10 @@ int main(int argc, const char *argv[]) {
 
   test_utils::parse_options(argc, argv, desc, vm);
   int verbosity = vm["verbosity"].as<int>();
-  int trace_size = vm["trace_sz"].as<int>();
 
   constexpr bool VERIFY = true;
-  constexpr bool ENABLE_TRACING = false;
-  // constexpr int TRACE_SIZE = 8192;
   constexpr int IN_SIZE = 4096;
-  constexpr int OUT_SIZE = ENABLE_TRACING ? IN_SIZE + trace_size / 4 : IN_SIZE;
+  constexpr int OUT_SIZE = IN_SIZE;
 
   // Load instruction sequence
   std::vector<uint32_t> instr_v =
@@ -64,7 +61,7 @@ int main(int argc, const char *argv[]) {
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
   auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE),
                              XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE) + trace_size,
+  auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE),
                          XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
 
   if (verbosity >= 1)
@@ -85,7 +82,7 @@ int main(int argc, const char *argv[]) {
 
   // Zero out buffer bo_outC
   DATATYPE *bufOut = bo_outC.map<DATATYPE *>();
-  memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE) + trace_size);
+  memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE));
 
   // sync host to device memories
   bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
@@ -120,11 +117,6 @@ int main(int argc, const char *argv[]) {
     }
   }
 
-  if (trace_size > 0) {
-    test_utils::write_out_trace(((char *)bufOut) + (IN_SIZE * sizeof(DATATYPE)),
-                                trace_size, vm["trace_file"].as<std::string>());
-  }
-
   // Print Pass/Fail result of our test
   if (!errors) {
     std::cout << std::endl << "PASS!" << std::endl << std::endl;
diff --git a/programming_guide/section-3/test.py b/programming_guide/section-3/test.py
index bfdc33cbea..4028e889b6 100644
--- a/programming_guide/section-3/test.py
+++ b/programming_guide/section-3/test.py
@@ -15,7 +15,6 @@
 from aie.extras.dialects.ext import memref, arith
 
 import aie.utils.test as test_utils
-import aie.utils.trace as trace_utils
 
 
 def main(opts):
@@ -41,7 +40,7 @@ def main(opts):
     INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
     INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
 
-    OUT_SIZE = INOUT2_SIZE + int(opts.trace_size)
+    OUT_SIZE = INOUT2_SIZE
 
     # ------------------------------------------------------
     # Get device, load the xclbin & kernel and register them
@@ -99,11 +98,6 @@ def main(opts):
         e = np.equal(output_buffer, ref)
         errors = errors + np.size(e) - np.count_nonzero(e)
 
-    # Write trace values if trace_size > 0
-    if opts.trace_size > 0:
-        trace_buffer = entire_buffer[INOUT2_VOLUME:]
-        trace_utils.write_out_trace(trace_buffer, str(opts.trace_file))
-
     # ------------------------------------------------------
     # Print verification and timing results
     # ------------------------------------------------------
diff --git a/programming_guide/section-4/CMakeLists.txt b/programming_guide/section-4/CMakeLists.txt
deleted file mode 100644
index 6b330f21c1..0000000000
--- a/programming_guide/section-4/CMakeLists.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2024 Advanced Micro Devices, Inc.
-
-# parameters
-# -DBOOST_ROOT: Path to Boost install
-# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
-# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
-# -DTARGET_NAME: Target name to be built
-
-# cmake needs this line
-cmake_minimum_required(VERSION 3.1)
-
-find_program(WSL NAMES powershell.exe)
-
-if (NOT WSL)
-    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
-    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
-else()
-    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
-    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
-endif()
-
-set(TARGET_NAME test CACHE STRING "Target to be built")
-
-SET (ProjectName ${TARGET_NAME})
-SET (currentTarget ${TARGET_NAME})
-
-if ( WSL )
-	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
-endif ()
-
-project(${ProjectName})
-
-# Find packages
-find_package(Boost REQUIRED)
-
-add_executable(${currentTarget}
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib/test_utils.cpp
-    test.cpp
-)
-
-target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
-
-target_include_directories (${currentTarget} PUBLIC 
-    ${XRT_INC_DIR}
-    ${Boost_INCLUDE_DIRS}
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib
-)
-
-target_link_directories(${currentTarget} PUBLIC
-    ${XRT_LIB_DIR}
-    ${Boost_LIBRARY_DIRS}
-)
-
-if (NOT WSL)
-    target_link_libraries(${currentTarget} PUBLIC
-        xrt_coreutil
-        boost_program_options
-        boost_filesystem
-    )
-else()
-    target_link_libraries(${currentTarget} PUBLIC
-        xrt_coreutil
-    )
-endif()
diff --git a/programming_guide/section-4/aie2.py b/programming_guide/section-4/aie2.py
deleted file mode 100644
index 4231179c36..0000000000
--- a/programming_guide/section-4/aie2.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2023 AMD Inc.
-
-from aie.dialects.aie import *  # primary mlir-aie dialect definitions
-from aie.extras.context import mlir_mod_ctx  # mlir-aie context
-
-from aie.dialects.aiex import *  # extended mlir-aie dialect definitions
-from aie.dialects.scf import *  # scf (strcutred control flow) dialect
-from aie.extras.dialects.ext import memref, arith  # memref and arithmatic dialects
-
-
-# AI Engine structural design function
-def my_first_aie_program():
-
-    # Dvice declaration - aie2 device NPU
-    @device(AIEDevice.npu)
-    def device_body():
-        # Memref types
-        memRef_8_ty = T.memref(8, T.i32())
-        memRef_16_ty = T.memref(16, T.i32())
-        memRef_32_ty = T.memref(32, T.i32())
-        memRef_64_ty = T.memref(64, T.i32())
-
-        # Tile declarations
-        ComputeTile = tile(0, 2)
-        ShimTile = tile(0, 0)
-
-        # Data movement with object FIFOs
-        # Input (from shim tile to compute tile)
-        of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)
-
-        # Output (from compute tile to shim tile)
-        of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)
-
-        # Compute tile body
-        @core(ComputeTile)
-        def core_body():
-            for _ in for_(8):
-                # Acquire input and output object FIFO objects
-                elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
-                elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
-
-                # Core functionality - load, add 1, store
-                for i in for_(8):
-                    v0 = memref.load(elem_in, [i])
-                    v1 = arith.addi(v0, arith.constant(1, T.i32()))
-                    memref.store(v1, elem_out, [i])
-                    yield_([])
-
-                # Release input and output object FIFO objects
-                of_in0.release(ObjectFifoPort.Consume, 1)
-                of_out0.release(ObjectFifoPort.Produce, 1)
-                yield_([])
-
-        # To/from AIE-array data movement
-        @FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty)
-        def sequence(inTensor, unused, outTensor):
-            npu_dma_memcpy_nd(
-                metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
-            )
-            npu_dma_memcpy_nd(
-                metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
-            )
-            npu_sync(column=0, row=0, direction=0, channel=0)
-
-
-# Declares that subsequent code is in mlir-aie context
-with mlir_mod_ctx() as ctx:
-    my_first_aie_program()  # Call design function within the mlir-aie context
-    print(ctx.module)  # Print the python-to-mlir conversion
diff --git a/programming_guide/section-4/section-4a/Makefile b/programming_guide/section-4/section-4a/Makefile
index ee28c567c4..3b0140656f 100644
--- a/programming_guide/section-4/section-4a/Makefile
+++ b/programming_guide/section-4/section-4a/Makefile
@@ -16,10 +16,14 @@ build/aie.mlir: aie2.py
 	mkdir -p ${@D}
 	python3 $< > $@
 
-build/final.xclbin: build/aie.mlir
+build/scale.o: vector_scalar_mul.cc
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
-		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
+
+build/final.xclbin: build/aie.mlir build/scale.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_guide/section-4/section-4a/README.md b/programming_guide/section-4/section-4a/README.md
index e373fcfc48..79acbb02fe 100644
--- a/programming_guide/section-4/section-4a/README.md
+++ b/programming_guide/section-4/section-4a/README.md
@@ -24,7 +24,7 @@ Adding the application timer is as simple as noting a start and stop time surrou
 
 ```c++
     auto start = std::chrono::high_resolution_clock::now();
-    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
+    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC);
     run.wait();
     auto stop = std::chrono::high_resolution_clock::now();
 
@@ -78,9 +78,6 @@ We can then compute and print the actual average, minimum and maximum run times.
 
 1. Let's set our iterations to 10 and run again with `make run` which recompiles our host code for `test.cpp`. What reported Avg NPU time do you see this time? <img src="../../../mlir_tutorials/images/answer1.jpg" title="Answer can be anywhere from 430-480us but is likely different than before" height=25>
 
-1. Let's change our design and increase the loop size of our kernel by a factor of 10. This involves changing the outer loop from 8 to 80. What reported times do you see now? <img src="../../../mlir_tutorials/images/answer1.jpg" title="? us" height=25>
-
-
 -----
 [[Up]](../../section-4) [[Next]](../section-4b)
 
diff --git a/programming_guide/section-4/section-4a/aie2.py b/programming_guide/section-4/section-4a/aie2.py
index 3e1f7e59ab..b09f9d0637 100644
--- a/programming_guide/section-4/section-4a/aie2.py
+++ b/programming_guide/section-4/section-4a/aie2.py
@@ -5,75 +5,73 @@
 #
 # (c) Copyright 2023 AMD Inc.
 
-from aie.dialects.aie import *  # primary mlir-aie dialect definitions
-from aie.extras.context import mlir_mod_ctx  # mlir-aie context
+import sys
 
-from aie.dialects.aiex import *  # extended mlir-aie dialect definitions
-from aie.dialects.scf import *  # scf (strcutred control flow) dialect
-from aie.extras.dialects.ext import memref, arith  # memref and arithmatic dialects
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
 
+import aie.utils.trace as trace_utils
 
-# AI Engine structural design function
-def my_first_aie_program():
 
-    # Dvice declaration - aie2 device NPU
+def my_vector_scalar():
+
     @device(AIEDevice.npu)
     def device_body():
-        # Memref types
-        memRef_8_ty = T.memref(8, T.i32())
-        memRef_16_ty = T.memref(16, T.i32())
-        memRef_32_ty = T.memref(32, T.i32())
-        memRef_64_ty = T.memref(64, T.i32())
-        memRef_640_ty = T.memref(640, T.i32())
+        memRef_ty = T.memref(1024, T.i32())
+
+        # AIE Core Function declarations
+        scale_scalar = external_func(
+            "vector_scalar_mul_aie_scalar",
+            inputs=[memRef_ty, memRef_ty, T.memref(1, T.i32()), T.i32()],
+        )
 
         # Tile declarations
-        ComputeTile = tile(0, 2)
         ShimTile = tile(0, 0)
+        ComputeTile2 = tile(0, 2)
 
-        # Data movement with object FIFOs
-        # Input (from shim tile to compute tile)
-        of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)
-
-        # Output (from compute tile to shim tile)
-        of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)
+        # AIE-array data movement with object fifos
+        of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
+        of_factor = object_fifo(
+            "infactor", ShimTile, ComputeTile2, 2, T.memref(1, T.i32())
+        )
+        of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
 
-        # Compute tile body
-        @core(ComputeTile)
+        # Set up compute tiles
+        # Compute tile 2
+        @core(ComputeTile2, "scale.o")
         def core_body():
-            for _ in for_(8):
-                # Acquire input and output object FIFO objects
-                elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
-                elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
-
-                # Core functionality - load, add 1, store
-                for i in for_(8):
-                    v0 = memref.load(elem_in, [i])
-                    v1 = arith.addi(v0, arith.constant(1, T.i32()))
-                    memref.store(v1, elem_out, [i])
+            # Effective while(1)
+            for _ in for_(sys.maxsize):
+                elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1)
+                # Number of sub-vector "tile" iterations
+                for _ in for_(4):
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    call(scale_scalar, [elem_in, elem_out, elem_factor, 1024])
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
                     yield_([])
-
-                # Release input and output object FIFO objects
-                of_in0.release(ObjectFifoPort.Consume, 1)
-                of_out0.release(ObjectFifoPort.Produce, 1)
+                of_factor.release(ObjectFifoPort.Consume, 1)
                 yield_([])
 
         # To/from AIE-array data movement
-        @FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty)
-        def sequence(inTensor, unused, outTensor):
-            npu_dma_memcpy_nd(
-                metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
-            )
-            npu_dma_memcpy_nd(
-                metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
-            )
+        tensor_ty = T.memref(4096, T.i32())
+        scalar_ty = T.memref(1, T.i32())
+
+        @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
+        def sequence(A, F, C):
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096])
+            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096])
+            npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
             npu_sync(column=0, row=0, direction=0, channel=0)
 
 
-# Declares that subsequent code is in mlir-aie context
 with mlir_mod_ctx() as ctx:
-    my_first_aie_program()  # Call design function within the mlir-aie context
-    res = ctx.module.operation.verify()  # Verify mlir context
+    my_vector_scalar()
+    res = ctx.module.operation.verify()
     if res == True:
-        print(ctx.module)  # Print the python-to-mlir conversion
+        print(ctx.module)
     else:
         print(res)
diff --git a/programming_guide/section-4/section-4a/answers/aie2.py b/programming_guide/section-4/section-4a/answers/aie2.py
deleted file mode 100644
index 595e0c11d2..0000000000
--- a/programming_guide/section-4/section-4a/answers/aie2.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2023 AMD Inc.
-
-from aie.dialects.aie import *  # primary mlir-aie dialect definitions
-from aie.extras.context import mlir_mod_ctx  # mlir-aie context
-
-from aie.dialects.aiex import *  # extended mlir-aie dialect definitions
-from aie.dialects.scf import *  # scf (strcutred control flow) dialect
-from aie.extras.dialects.ext import memref, arith  # memref and arithmatic dialects
-
-
-# AI Engine structural design function
-def my_first_aie_program():
-
-    # Dvice declaration - aie2 device NPU
-    @device(AIEDevice.npu)
-    def device_body():
-        # Memref types
-        memRef_8_ty = T.memref(8, T.i32())
-        memRef_16_ty = T.memref(16, T.i32())
-        memRef_32_ty = T.memref(32, T.i32())
-        memRef_64_ty = T.memref(64, T.i32())
-        memRef_640_ty = T.memref(640, T.i32())
-
-        # Tile declarations
-        ComputeTile = tile(0, 2)
-        ShimTile = tile(0, 0)
-
-        # Data movement with object FIFOs
-        # Input (from shim tile to compute tile)
-        of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)
-
-        # Output (from compute tile to shim tile)
-        of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)
-
-        # Compute tile body
-        @core(ComputeTile)
-        def core_body():
-            for _ in for_(80):
-                # Acquire input and output object FIFO objects
-                elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
-                elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
-
-                # Core functionality - load, add 1, store
-                for i in for_(8):
-                    v0 = memref.load(elem_in, [i])
-                    v1 = arith.addi(v0, arith.constant(1, T.i32()))
-                    memref.store(v1, elem_out, [i])
-                    yield_([])
-
-                # Release input and output object FIFO objects
-                of_in0.release(ObjectFifoPort.Consume, 1)
-                of_out0.release(ObjectFifoPort.Produce, 1)
-                yield_([])
-
-        # To/from AIE-array data movement
-        @FuncOp.from_py_func(memRef_640_ty, memRef_64_ty, memRef_640_ty)
-        def sequence(inTensor, unused, outTensor):
-            npu_dma_memcpy_nd(
-                metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 640]
-            )
-            npu_dma_memcpy_nd(
-                metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 640]
-            )
-            npu_sync(column=0, row=0, direction=0, channel=0)
-
-
-# Declares that subsequent code is in mlir-aie context
-with mlir_mod_ctx() as ctx:
-    my_first_aie_program()  # Call design function within the mlir-aie context
-    res = ctx.module.operation.verify()  # Verify mlir context
-    if res == True:
-        print(ctx.module)  # Print the python-to-mlir conversion
-    else:
-        print(res)
diff --git a/programming_guide/section-4/section-4a/answers/test.cpp b/programming_guide/section-4/section-4a/answers/test.cpp
deleted file mode 100644
index d154a97425..0000000000
--- a/programming_guide/section-4/section-4a/answers/test.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#include <boost/program_options.hpp>
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-#include "test_utils.h"
-
-#ifndef DATATYPES_USING_DEFINED
-#define DATATYPES_USING_DEFINED
-// ------------------------------------------------------
-// Configure this to match your buffer data type
-// ------------------------------------------------------
-using INOUT0_DATATYPE = std::uint32_t;
-using INOUT1_DATATYPE = std::uint32_t;
-using INOUT2_DATATYPE = std::uint32_t;
-#endif
-
-namespace po = boost::program_options;
-
-// ----------------------------------------------------------------------------
-// Verify results (specific to our design example)
-// ----------------------------------------------------------------------------
-template <typename Tout>
-int verify(int CSize, std::vector<Tout> C, int verbosity) {
-  int errors = 0;
-  for (uint32_t i = 0; i < CSize; i++) {
-    uint32_t ref = i + 2;
-    if (C[i] != ref) {
-      std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
-      errors++;
-    } else {
-      if (verbosity > 1)
-        std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
-    }
-  }
-  return errors;
-}
-
-// ----------------------------------------------------------------------------
-// Main
-// ----------------------------------------------------------------------------
-int main(int argc, const char *argv[]) {
-
-  // ------------------------------------------------------
-  // Parse program arguments
-  // ------------------------------------------------------
-  po::options_description desc("Allowed options");
-  po::variables_map vm;
-  test_utils::add_default_options(desc);
-
-  test_utils::parse_options(argc, argv, desc, vm);
-  int verbosity = vm["verbosity"].as<int>();
-  int do_verify = vm["verify"].as<bool>();
-  int n_iterations = vm["iters"].as<int>();
-  int n_warmup_iterations = vm["warmup"].as<int>();
-  int trace_size = vm["trace_sz"].as<int>();
-
-  // ------------------------------------------------------
-  // Configure this to match your design's buffer size
-  // ------------------------------------------------------
-  int INOUT0_VOLUME = 640; // Input only, 64x uint32_t in this example
-  int INOUT1_VOLUME = 640; // Not used in this example
-  int INOUT2_VOLUME = 640; // Output only, 64x uint32_t in this example
-
-  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
-  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
-  size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
-
-  // TODO Remove trace for now?
-  size_t OUT_SIZE = INOUT2_SIZE + trace_size;
-
-  srand(time(NULL));
-
-  // Load instruction sequence
-  std::vector<uint32_t> instr_v =
-      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
-  if (verbosity >= 1)
-    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
-
-  // ------------------------------------------------------
-  // Get device, load the xclbin & kernel and register them
-  // ------------------------------------------------------
-  xrt::device device;
-  xrt::kernel kernel;
-
-  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
-                                   vm["xclbin"].as<std::string>(),
-                                   vm["kernel"].as<std::string>());
-
-  // ------------------------------------------------------
-  // Initialize input/ output buffer sizes and sync them
-  // ------------------------------------------------------
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inout0 =
-      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inout1 =
-      xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  // Assumes trace will only be added to inout2
-  auto bo_inout2 =
-      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-
-  if (verbosity >= 1)
-    std::cout << "Writing data into buffer objects.\n";
-
-  // Initialize instruction buffer
-  void *bufInstr = bo_instr.map<void *>();
-  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  // Initialize Inout buffer 0
-  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
-  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
-  for (int i = 0; i < INOUT0_VOLUME; i++)
-    AVec[i] = i + 1;
-  // AVec.push_back(i + 1);
-  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
-
-  // Initialize Inout buffer 1
-  // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT0_DATATYPE *>();
-  // std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
-  // for (int i = 0; i < INOUT1_VOLUME; i++)
-  //   BVec[i] = i + 1
-  //   //BVec.push_back(i + 1);
-  // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
-
-  // Initialize Inout buffer 2
-  char *bufInOut2 = bo_inout2.map<char *>();
-  std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
-  memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
-
-  // Sync buffers to update input buffer values
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  // ------------------------------------------------------
-  // Initialize run configs
-  // ------------------------------------------------------
-  unsigned num_iter = n_iterations + n_warmup_iterations;
-  float npu_time_total = 0;
-  float npu_time_min = 9999999;
-  float npu_time_max = 0;
-
-  int errors = 0;
-
-  // ------------------------------------------------------
-  // Main run loop
-  // ------------------------------------------------------
-  for (unsigned iter = 0; iter < num_iter; iter++) {
-
-    // Run kernel
-    if (verbosity >= 1)
-      std::cout << "Running Kernel.\n";
-    auto start = std::chrono::high_resolution_clock::now();
-    auto run =
-        kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
-    run.wait();
-    auto stop = std::chrono::high_resolution_clock::now();
-    bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-    if (iter < n_warmup_iterations) {
-      /* Warmup iterations do not count towards average runtime. */
-      continue;
-    }
-
-    // Copy output results and verify they are correct
-    memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
-    if (do_verify) {
-      if (verbosity >= 1) {
-        std::cout << "Verifying results ..." << std::endl;
-      }
-      auto vstart = std::chrono::system_clock::now();
-      errors = verify(INOUT2_VOLUME, CVec, verbosity);
-      auto vstop = std::chrono::system_clock::now();
-      float vtime =
-          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
-              .count();
-      if (verbosity >= 1) {
-        std::cout << "Verify time: " << vtime << "secs." << std::endl;
-      }
-    } else {
-      if (verbosity >= 1)
-        std::cout << "WARNING: results not verified." << std::endl;
-    }
-
-    // Write trace values if trace_size > 0
-    if (trace_size > 0) {
-      // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE,
-      // trace_size,
-      test_utils::write_out_trace(((char *)bufInOut2), trace_size,
-                                  vm["trace_file"].as<std::string>());
-    }
-
-    // Accumulate run times
-    float npu_time =
-        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-            .count();
-
-    npu_time_total += npu_time;
-    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
-    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
-  }
-
-  // ------------------------------------------------------
-  // Print verification and timing results
-  // ------------------------------------------------------
-
-  // TODO - Mac count to guide gflops
-  float macs = 0;
-
-  std::cout << std::endl
-            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
-            << std::endl;
-  if (macs > 0)
-    std::cout << "Avg NPU gflops: "
-              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
-
-  std::cout << std::endl
-            << "Min NPU time: " << npu_time_min << "us." << std::endl;
-  if (macs > 0)
-    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
-              << std::endl;
-
-  std::cout << std::endl
-            << "Max NPU time: " << npu_time_max << "us." << std::endl;
-  if (macs > 0)
-    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
-              << std::endl;
-
-  if (!errors) {
-    std::cout << "\nPASS!\n\n";
-    return 0;
-  } else {
-    std::cout << "\nError count: " << errors << "\n\n";
-    std::cout << "\nFailed.\n\n";
-    return 1;
-  }
-}
diff --git a/programming_guide/section-4/section-4a/test.cpp b/programming_guide/section-4/section-4a/test.cpp
index 2ec8a0d1c3..a5af1576bf 100644
--- a/programming_guide/section-4/section-4a/test.cpp
+++ b/programming_guide/section-4/section-4a/test.cpp
@@ -1,4 +1,4 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,59 +8,26 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <boost/program_options.hpp>
 #include <cstdint>
 #include <fstream>
 #include <iostream>
 #include <sstream>
-#include <string>
-#include <vector>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
 
 #include "test_utils.h"
+#include "xrt/xrt_bo.h"
 
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
-// ------------------------------------------------------
-// Configure this to match your buffer data type
-// ------------------------------------------------------
-using INOUT0_DATATYPE = std::uint32_t;
-using INOUT1_DATATYPE = std::uint32_t;
-using INOUT2_DATATYPE = std::uint32_t;
+using DATATYPE = std::uint32_t; // Configure this to match your buffer data type
 #endif
 
-namespace po = boost::program_options;
+const int scaleFactor = 3;
 
-// ----------------------------------------------------------------------------
-// Verify results (specific to our design example)
-// ----------------------------------------------------------------------------
-template <typename Tout>
-int verify(int CSize, std::vector<Tout> C, int verbosity) {
-  int errors = 0;
-  for (uint32_t i = 0; i < CSize; i++) {
-    uint32_t ref = i + 2;
-    if (C[i] != ref) {
-      std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
-      errors++;
-    } else {
-      if (verbosity > 1)
-        std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
-    }
-  }
-  return errors;
-}
+namespace po = boost::program_options;
 
-// ----------------------------------------------------------------------------
-// Main
-// ----------------------------------------------------------------------------
 int main(int argc, const char *argv[]) {
 
-  // ------------------------------------------------------
-  // Parse program arguments
-  // ------------------------------------------------------
+  // Program arguments parsing
   po::options_description desc("Allowed options");
   po::variables_map vm;
   test_utils::add_default_options(desc);
@@ -70,33 +37,19 @@ int main(int argc, const char *argv[]) {
   int do_verify = vm["verify"].as<bool>();
   int n_iterations = vm["iters"].as<int>();
   int n_warmup_iterations = vm["warmup"].as<int>();
-  int trace_size = vm["trace_sz"].as<int>();
-
-  // ------------------------------------------------------
-  // Configure this to match your design's buffer size
-  // ------------------------------------------------------
-  int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example
-  int INOUT1_VOLUME = 64; // Not used in this example
-  int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example
-
-  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
-  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
-  size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
 
-  // TODO Remove trace for now?
-  size_t OUT_SIZE = INOUT2_SIZE + trace_size;
-
-  srand(time(NULL));
+  constexpr bool VERIFY = true;
+  constexpr int IN_SIZE = 4096;
+  constexpr int OUT_SIZE = IN_SIZE;
 
   // Load instruction sequence
   std::vector<uint32_t> instr_v =
       test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+
   if (verbosity >= 1)
     std::cout << "Sequence instr count: " << instr_v.size() << "\n";
 
-  // ------------------------------------------------------
-  // Get device, load the xclbin & kernel and register them
-  // ------------------------------------------------------
+  // Start the XRT context and load the kernel
   xrt::device device;
   xrt::kernel kernel;
 
@@ -104,52 +57,41 @@ int main(int argc, const char *argv[]) {
                                    vm["xclbin"].as<std::string>(),
                                    vm["kernel"].as<std::string>());
 
-  // ------------------------------------------------------
-  // Initialize input/ output buffer sizes and sync them
-  // ------------------------------------------------------
+  // set up the buffer objects
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inout0 =
-      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inout1 =
-      xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  // Assumes trace will only be added to inout2
-  auto bo_inout2 =
-      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(DATATYPE),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE),
+                             XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects.\n";
 
-  // Initialize instruction buffer
+  // Copy instruction stream to xrt buffer object
   void *bufInstr = bo_instr.map<void *>();
   memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
 
-  // Initialize Inout buffer 0
-  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
-  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
-  for (int i = 0; i < INOUT0_VOLUME; i++)
-    AVec[i] = i + 1;
-  // AVec.push_back(i + 1);
-  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
-
-  // Initialize Inout buffer 1
-  // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT0_DATATYPE *>();
-  // std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
-  // for (int i = 0; i < INOUT1_VOLUME; i++)
-  //   BVec[i] = i + 1
-  //   //BVec.push_back(i + 1);
-  // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
-
-  // Initialize Inout buffer 2
-  char *bufInOut2 = bo_inout2.map<char *>();
-  std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
-  memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
-
-  // Sync buffers to update input buffer values
+  // Initialize buffer bo_inA
+  DATATYPE *bufInA = bo_inA.map<DATATYPE *>();
+  for (int i = 0; i < IN_SIZE; i++)
+    bufInA[i] = i + 1;
+
+  // Initialize buffer bo_inFactor
+  DATATYPE *bufInFactor = bo_inFactor.map<DATATYPE *>();
+  *bufInFactor = scaleFactor;
+
+  // Zero out buffer bo_outC
+  DATATYPE *bufOut = bo_outC.map<DATATYPE *>();
+  memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE));
+
+  // sync host to device memories
   bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_outC.sync(XCL_BO_SYNC_BO_TO_DEVICE);
 
   // ------------------------------------------------------
   // Initialize run configs
@@ -170,11 +112,12 @@ int main(int argc, const char *argv[]) {
     if (verbosity >= 1)
       std::cout << "Running Kernel.\n";
     auto start = std::chrono::high_resolution_clock::now();
-    auto run =
-        kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
+    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC);
     run.wait();
     auto stop = std::chrono::high_resolution_clock::now();
-    bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    // Sync device to host memories
+    bo_outC.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
 
     if (iter < n_warmup_iterations) {
       /* Warmup iterations do not count towards average runtime. */
@@ -182,13 +125,26 @@ int main(int argc, const char *argv[]) {
     }
 
     // Copy output results and verify they are correct
-    memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
+    // (no copy needed: results are checked in place via the mapped bufOut)
     if (do_verify) {
       if (verbosity >= 1) {
         std::cout << "Verifying results ..." << std::endl;
       }
       auto vstart = std::chrono::system_clock::now();
-      errors = verify(INOUT2_VOLUME, CVec, verbosity);
+      for (uint32_t i = 0; i < IN_SIZE; i++) {
+        int32_t ref = bufInA[i] * scaleFactor;
+        int32_t test = bufOut[i];
+        if (test != ref) {
+          if (verbosity >= 1)
+            std::cout << "Error in output " << test << " != " << ref
+                      << std::endl;
+          errors++;
+        } else {
+          if (verbosity >= 1)
+            std::cout << "Correct output " << test << " == " << ref
+                      << std::endl;
+        }
+      }
       auto vstop = std::chrono::system_clock::now();
       float vtime =
           std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
@@ -201,14 +157,6 @@ int main(int argc, const char *argv[]) {
         std::cout << "WARNING: results not verified." << std::endl;
     }
 
-    // Write trace values if trace_size > 0
-    if (trace_size > 0) {
-      // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE,
-      // trace_size,
-      test_utils::write_out_trace(((char *)bufInOut2), trace_size,
-                                  vm["trace_file"].as<std::string>());
-    }
-
     // Accumulate run times
     float npu_time =
         std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
@@ -245,12 +193,15 @@ int main(int argc, const char *argv[]) {
     std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
               << std::endl;
 
+  // Print Pass/Fail result of our test
   if (!errors) {
-    std::cout << "\nPASS!\n\n";
+    std::cout << std::endl << "PASS!" << std::endl << std::endl;
     return 0;
   } else {
-    std::cout << "\nError count: " << errors << "\n\n";
-    std::cout << "\nFailed.\n\n";
+    std::cout << std::endl
+              << errors << " mismatches." << std::endl
+              << std::endl;
+    std::cout << std::endl << "fail." << std::endl << std::endl;
     return 1;
   }
 }
diff --git a/programming_guide/section-4/section-4a/test.py b/programming_guide/section-4/section-4a/test.py
index 0e82d741cb..887586472d 100644
--- a/programming_guide/section-4/section-4a/test.py
+++ b/programming_guide/section-4/section-4a/test.py
@@ -8,22 +8,13 @@
 import sys
 import time
 
-import aie.utils.test as test_utils
-
-# ------------------------------------------------------
-# Configure this to match your design's buffer size
-# ------------------------------------------------------
-INOUT0_VOLUME = 64  # Input only, 64x uint32_t in this example
-INOUT1_VOLUME = 64  # Not used in this example
-INOUT2_VOLUME = 64  # Output only, 64x uint32_t in this example
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext import memref, arith
 
-INOUT0_DATATYPE = np.uint32
-INOUT1_DATATYPE = np.uint32
-INOUT2_DATATYPE = np.uint32
-
-INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
-INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
-INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
+import aie.utils.test as test_utils
 
 
 def main(opts):
@@ -34,6 +25,21 @@ def main(opts):
         instr_text = [l for l in instr_text if l != ""]
         instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
 
+    # ------------------------------------------------------------
+    # Configure this to match your design's buffer size and type
+    # ------------------------------------------------------------
+    INOUT0_VOLUME = int(4096)  # Input only, 4096x int32 in this example
+    INOUT1_VOLUME = int(1)  # Input only, 1x int32 scale factor
+    INOUT2_VOLUME = int(4096)  # Output only, 4096x int32 in this example
+
+    INOUT0_DATATYPE = np.int32
+    INOUT1_DATATYPE = np.int32
+    INOUT2_DATATYPE = np.int32
+
+    INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
+    INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
+    INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
+
     OUT_SIZE = INOUT2_SIZE
 
     # ------------------------------------------------------
@@ -47,7 +53,6 @@ def main(opts):
     bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
     bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
     bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
-    # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4))
     bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4))
 
     # Initialize instruction buffer
@@ -55,10 +60,10 @@ def main(opts):
 
     # Initialize data buffers
     inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
-    inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE)
-    inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE)
+    scale_factor = np.array([3], dtype=INOUT1_DATATYPE)
+    inout2 = np.zeros(OUT_SIZE, dtype=np.uint8)
     bo_inout0.write(inout0, 0)
-    bo_inout1.write(inout1, 0)
+    bo_inout1.write(scale_factor, 0)
     bo_inout2.write(inout2, 0)
 
     # Sync buffers to update input buffer values
@@ -94,12 +99,12 @@ def main(opts):
             continue
 
         # Copy output results and verify they are correct
-        out_size = INOUT2_SIZE
-        output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE)
+        entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32)
+        output_buffer = entire_buffer[:INOUT2_VOLUME]
         if opts.verify:
             if opts.verbosity >= 1:
                 print("Verifying results ...")
-            ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE)
+            ref = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) * scale_factor
             e = np.equal(output_buffer, ref)
             errors = errors + np.size(e) - np.count_nonzero(e)
 
@@ -128,5 +133,6 @@ def main(opts):
 
 
 if __name__ == "__main__":
-    opts = test_utils.parse_args(sys.argv[1:])
+    p = test_utils.create_default_argparser()
+    opts = p.parse_args(sys.argv[1:])
     main(opts)
diff --git a/programming_guide/section-4/section-4a/vector_scalar_mul.cc b/programming_guide/section-4/section-4a/vector_scalar_mul.cc
new file mode 100755
index 0000000000..10c0aecbbc
--- /dev/null
+++ b/programming_guide/section-4/section-4a/vector_scalar_mul.cc
@@ -0,0 +1,25 @@
+//===- vector_scalar_mul.cc -------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+extern "C" {
+
+void vector_scalar_mul_aie_scalar(int32_t *a, int32_t *c, int32_t *factor,
+                                  int32_t N) {
+  for (int i = 0; i < N; i++) {
+    c[i] = *factor * a[i];
+  }
+}
+
+} // extern "C"
diff --git a/programming_guide/section-4/section-4b/Makefile b/programming_guide/section-4/section-4b/Makefile
index 09126e5289..8b7b1cc434 100644
--- a/programming_guide/section-4/section-4b/Makefile
+++ b/programming_guide/section-4/section-4b/Makefile
@@ -18,10 +18,14 @@ build/aie.mlir: aie2.py
 	mkdir -p ${@D}
 	python3 $< > $@
 
-build/final.xclbin: build/aie.mlir
+build/scale.o: vector_scalar_mul.cc
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
-		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
+
+build/final.xclbin: build/aie.mlir build/scale.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
@@ -37,7 +41,6 @@ endif
 run: ${targetname}.exe build/final.xclbin build/insts.txt 
 	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
 
-
 run_py: build/final.xclbin build/insts.txt
 	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
 
diff --git a/programming_guide/section-4/section-4b/README.md b/programming_guide/section-4/section-4b/README.md
index aeb08d76b5..6a5a6d3832 100644
--- a/programming_guide/section-4/section-4b/README.md
+++ b/programming_guide/section-4/section-4b/README.md
@@ -124,16 +124,17 @@ As described in [python/utils](../../../python/utils) for `trace.py`, we configu
 | 1 | inout1 |
 | 2 | inout2 |
 
-An example of this is in the Vector Scalar Multiply example ([aie2.py](../../../programming_examples/basic/vector_scalar_mul/aie2.py)), where it uses the 2nd pattern above (input A, input B, output C + trace). In the vector scalar multiply case, A is used for the input vector and B for the scalar factor. Since we're sharing the trace data with the output buffer on `inout2`, we set `ddr_id=2`. In addition, we set the offset to be the output data buffer size since the trace data is appended after the data (`offset=N_in_bytes`). For our local design ([aie2.py](./aie.py)), we have variation of the 2nd pattern but the second inout buffer is unused (input A, unused, output C + trace). `ddr_id=2` is still used since our output buffer is mapped to `inout2` and our trace data offset is specified as `C_sz_in_bytes`.
+Our section-4b example is modeled after the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul). Here, we are using the second inout mapping pattern (inputA, inputB, outputC + trace) in our [aie2.py](./aie2.py) source where `inout0` is called `A` (the vector input), `inout1` is called `F` (the scalar input) and `inout2` is called `C` (the vector output). Since the trace is mapped to `inout2`, we set `ddr_id=2` and the offset to be the output data buffer size since the trace is appended after the data (`offset=4096*4`).
 
 Once [aie2.py](./aie2.py) is configured to output trace data through one of the 3 inout buffers with matching `ddr_id` config and `offset`, we turn our attention to the host code to read the DDR data and write it to a file.
 
-> **NOTE** In our example design, the [aie2.py](./aie2.py) and associated [Makefile](./Makefile), we provide a Makefile target `run` for standard build and `trace` for trace-enabld build. The trace-enabled build passes the trace buffer size as an argument to [aie2.py](./aie2.py) which conditionally enables the trace `flow` and calls `configure_simple_tracing_aie2` as long as `trace_size` is > 0. This is also true for the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul).
+
+> **NOTE** In our example design, the [aie2.py](./aie2.py) and associated [Makefile](./Makefile), we provide a Makefile target `run` for standard build and `trace` for trace-enabled build. The trace-enabled build passes the trace buffer size as an argument to [aie2.py](./aie2.py) which conditionally enables the trace `flow` and calls `configure_simple_tracing_aie2` as long as `trace_size` is > 0. This is also true for the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul).
 
 ### <u>(2a) C/C++ Host code ([test.cpp](./test.cpp))</u>
 The main changes needed for [test.cpp](./test.cpp) is the increase in the output buffer size to account for the trace buffer size, being careful to read only the output buffer portion when verifying correctness of the results. We also need to be sure to pass the correct buffer offset which points to the trace buffer data when calling `write_out_trace`. 
 
-You can see in [test.cpp](.test.cpp) that trace_size is set based on an input argument of `-t $(trace_size)` which is defined and passed in the [Makefile](.Makefile). The `trace` target from the [Makefile](./Makefile) is shown below. 
+You can see in [test.cpp](./test.cpp) that trace_size is set based on an input argument of `-t $(trace_size)` which is defined and passed in the [Makefile](./Makefile). The `trace` target from the [Makefile](./Makefile) is shown below. 
 
 ```Makefile
 trace: ${targetname}.exe build/final.xclbin build/insts.txt 
@@ -143,15 +144,14 @@ trace: ${targetname}.exe build/final.xclbin build/insts.txt
 Following the invocation of the executable, we call the `parse_trace.py` python script which we will cover in more detail in step 3. 
 Within the [test.cpp](./test.cpp), we redefine OUT_SIZE to be the sum of output buffer size (in bytes) and the trace buffer size. 
 ```c++
-    int OUT_SIZE = INOUT2_SIZE + trace_size;
+    int OUT_SIZE = IN_SIZE + trace_size;
 ```
-All subsuquent references to the output buffer size should use  `OUT_SIZE`. The exception is when we want to verify the output results which should be bounded by the original output buffer size, in this case `INOUT2_VOLUME`.
+All subsequent references to the output buffer size should use  `OUT_SIZE`. The exception is when we want to verify the output results which should be bounded by the original output buffer size, in this case `IN_SIZE`.
 
 Finally, the function to write the trace output to a file as defined in `aie.utils.trace` is `write_out_trace` and we need to pass it the pointer in the output buffer where the trace data begins, the trace buffer size and the trace file name (default is `trace.txt`).
 ```c++
-      test_utils::write_out_trace(
-          ((char *)bufInOut2) + INOUT2_SIZE,
-          trace_size, vm["trace_file"].as<std::string>());
+      test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
 ```
 
 ### <u>(2b) Python Host code ([test.py](./test.py))</u>
@@ -163,7 +163,7 @@ trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
 ```
 The python equivalent host code performs the same steps as the C/C++ host code as we redefine `OUT_SIZE` to include the `trace_size`.
 ```python
-    OUT_SIZE = INOUT1_SIZE + int(opts.trace_size)
+    OUT_SIZE = INOUT2_SIZE + int(opts.trace_size)
 ```
 During verification, the `output_buffer` excludes the trace data and uses the `read` function as follows:
 ```python
@@ -195,14 +195,21 @@ Open https://ui.perfetto.dev in your browser and then open up the waveform json
     * Check matching packet IDs for packet-routed flows. The packet flow ID must match the configured ID value in Trace Control 1 register or else the packets don't get routed.
 
 ## <u>Exercises</u>
-1. Let's give tracing a try. In this directory, we're been examining a design based off the `Vector Scalar Add` example. Run `make trace` to compile the design and generate a trace file and run the `prase_trace.py` script on it to generate the `trace_4b.json` waveform file. Open this in http://ui.perfetto.dev. if you zoom into the region of interest with the W and S to zoom in and out respectively and A adn D to pan left and right. You should seem a wave like the following:
-
-    <img src="../../assets/trace_vector_scalar_add1.png" title="AIE-ML Vector Unit." height=250>
+1. Let's give tracing a try. In this directory, we've been examining a local design based off the `Vector Scalar Mul` example. Run `make trace` to compile the design and generate a trace file and run the `parse_trace.py` script on it to generate the `trace_4b.json` waveform file. Open this in http://ui.perfetto.dev. You can zoom into the region of interest with the keyboard shortcut keys W and S to zoom in and out respectively, and A and D to pan left and right. You should see a wave like the following:
 
-    Based on this wave, You can mouse over each chunk of continguous data for `PortRunning0` (input dma port) and `PortRunning1` (output dma port). What is the chunk size? <img src="../../../mlir_tutorials/images/answer1.jpg" title="8" height=25> How many input and output chunks are there? <img src="../../../mlir_tutorials/images/answer1.jpg" title="8" height=25> This shoudl match iteration loop bounds in our exmple design.
+    <img src="../../assets/trace_vector_scalar_mul1.png" title="AIE-ML Vector Unit." height=250>
 
-1. **TODO** Additional questions about routing congestion for circuit switch and packet switch routes for trace packets? <img src="../../../mlir_tutorials/images/answer1.jpg" title="AMD!" height=25>
+    Based on this wave, you can mouse over each chunk of contiguous data for `PortRunning0` (input dma port) and `PortRunning1` (output dma port). What is the chunk size? <img src="../../../mlir_tutorials/images/answer1.jpg" title="1024" height=25> How many input and output chunks are there? <img src="../../../mlir_tutorials/images/answer1.jpg" title="4 inputs and 4 outputs (last output might be truncated in viewer)" height=25> This should match the iteration loop bounds in our example design.
 
+    Here, we notice a few signals worth mentioning.
+    * `Event0` - The event marking the beginning of our kernel. See [vector_scalar_mul.cc](./vector_scalar_mul.cc) where we added the function `event0()` before the loop. This is generally a handy thing to do to attach an event to the beginning of our kernel.
+    * `Event1` - The event marking the end of our kernel. See [vector_scalar_mul.cc](./vector_scalar_mul.cc) where we added the function `event1()` after the loop. Much like event0, attaching event1 to the end of our kernel is also helpful.
+    * `VectorInstr` - Vector instructions like vector MAC or vector load/store. Here, we are running a scalar implementation so there are no vector events.
+    * `PortRunning0` - Mapped to Port 0 which is by default configured to the S2MM0 input (DMA from stream to local memory)
+    * `PortRunning1` - Mapped to Port 1 which is by default configured to the MM2S0 output (DMA from local memory to stream)
+    * `LockStall` - Any locks that are stalled in the core
+    * `LockAcquiresInstr` - Any lock acquire requests
+    * `LockReleaseInstr` - Any lock release requests
 
 -----
 [[Prev]](../section-4a) [[Up]](../../section-4) [[Next]](../section-4c)
diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py
index a629daa0ce..87d4e85d13 100644
--- a/programming_guide/section-4/section-4b/aie2.py
+++ b/programming_guide/section-4/section-4b/aie2.py
@@ -5,95 +5,89 @@
 #
 # (c) Copyright 2023 AMD Inc.
 
-from aie.dialects.aie import *  # primary mlir-aie dialect definitions
-from aie.extras.context import mlir_mod_ctx  # mlir-aie context
+import sys
 
-from aie.dialects.aiex import *  # extended mlir-aie dialect definitions
-from aie.dialects.scf import *  # scf (strcutred control flow) dialect
-from aie.extras.dialects.ext import memref, arith  # memref and arithmatic dialects
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
 
 import aie.utils.trace as trace_utils
 
 
-# AI Engine structural design function
-def my_first_aie_program():
+def my_vector_scalar():
 
     enableTrace = True
     trace_size = 8192
-    C_sz_in_bytes = 64 * 4
 
-    # Dvice declaration - aie2 device NPU
     @device(AIEDevice.npu)
     def device_body():
-        # Memref types
-        memRef_8_ty = T.memref(8, T.i32())
-        memRef_16_ty = T.memref(16, T.i32())
-        memRef_32_ty = T.memref(32, T.i32())
-        memRef_64_ty = T.memref(64, T.i32())
+        memRef_ty = T.memref(1024, T.i32())
+
+        # AIE Core Function declarations
+        scale_scalar = external_func(
+            "vector_scalar_mul_aie_scalar",
+            inputs=[memRef_ty, memRef_ty, T.memref(1, T.i32()), T.i32()],
+        )
 
         # Tile declarations
-        ComputeTile = tile(0, 2)
         ShimTile = tile(0, 0)
-
-        # Data movement with object FIFOs
-        # Input (from shim tile to compute tile)
-        of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)
-
-        # Output (from compute tile to shim tile)
-        of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)
-
-        # Compute tile body
-        @core(ComputeTile)
+        ComputeTile2 = tile(0, 2)
+
+        # AIE-array data movement with object fifos
+        of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
+        of_factor = object_fifo(
+            "infactor", ShimTile, ComputeTile2, 2, T.memref(1, T.i32())
+        )
+        of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
+
+        # Set up compute tiles
+        # Compute tile 2
+        @core(ComputeTile2, "scale.o")
         def core_body():
-            for _ in for_(0xFFFFFFFF):
-                # for _ in for_(8):
-                # Acquire input and output object FIFO objects
-                elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
-                elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
-
-                # Core functionality - load, add 1, store
-                for i in for_(8):
-                    v0 = memref.load(elem_in, [i])
-                    v1 = arith.addi(v0, arith.constant(1, T.i32()))
-                    memref.store(v1, elem_out, [i])
+            # Effective while(1)
+            for _ in for_(sys.maxsize):
+                elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1)
+                # Number of sub-vector "tile" iterations
+                for _ in for_(4):
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    call(scale_scalar, [elem_in, elem_out, elem_factor, 1024])
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
                     yield_([])
-
-                # Release input and output object FIFO objects
-                of_in0.release(ObjectFifoPort.Consume, 1)
-                of_out0.release(ObjectFifoPort.Produce, 1)
+                of_factor.release(ObjectFifoPort.Consume, 1)
                 yield_([])
 
         # Set up a circuit-switched flow from core to shim for tracing information
         if enableTrace:
-            flow(ComputeTile, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)
+            flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)
 
         # To/from AIE-array data movement
-        @FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty)
-        def sequence(inTensor, notUsed, outTensor):
+        tensor_ty = T.memref(4096, T.i32())
+        scalar_ty = T.memref(1, T.i32())
 
+        @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
+        def sequence(A, F, C):
             if enableTrace:
                 trace_utils.configure_simple_tracing_aie2(
-                    ComputeTile,
+                    ComputeTile2,
                     ShimTile,
                     ddr_id=2,
                     size=trace_size,
-                    offset=C_sz_in_bytes,
+                    offset=4096 * 4,  # offset in bytes
                 )
 
-            npu_dma_memcpy_nd(
-                metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
-            )
-            npu_dma_memcpy_nd(
-                metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
-            )
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096])
+            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096])
+            npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
             npu_sync(column=0, row=0, direction=0, channel=0)
 
 
-# Declares that subsequent code is in mlir-aie context
 with mlir_mod_ctx() as ctx:
-    my_first_aie_program()  # Call design function within the mlir-aie context
-    res = ctx.module.operation.verify()  # Verify mlir context
+    my_vector_scalar()
+    res = ctx.module.operation.verify()
     if res == True:
-        print(ctx.module)  # Print the python-to-mlir conversion
+        print(ctx.module)
     else:
         print(res)
diff --git a/programming_guide/section-4/section-4b/test.cpp b/programming_guide/section-4/section-4b/test.cpp
index 6f775e5b54..4e27fd8780 100644
--- a/programming_guide/section-4/section-4b/test.cpp
+++ b/programming_guide/section-4/section-4b/test.cpp
@@ -1,4 +1,4 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,59 +8,26 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <boost/program_options.hpp>
 #include <cstdint>
 #include <fstream>
 #include <iostream>
 #include <sstream>
-#include <string>
-#include <vector>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
 
 #include "test_utils.h"
+#include "xrt/xrt_bo.h"
 
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
-// ------------------------------------------------------
-// Configure this to match your buffer data type
-// ------------------------------------------------------
-using INOUT0_DATATYPE = std::uint32_t;
-using INOUT1_DATATYPE = std::uint32_t;
-using INOUT2_DATATYPE = std::uint32_t;
+using DATATYPE = std::uint32_t; // Configure this to match your buffer data type
 #endif
 
-namespace po = boost::program_options;
+const int scaleFactor = 3;
 
-// ----------------------------------------------------------------------------
-// Verify results (specific to our design example)
-// ----------------------------------------------------------------------------
-template <typename Tout>
-int verify(int CSize, std::vector<Tout> C, int verbosity) {
-  int errors = 0;
-  for (uint32_t i = 0; i < CSize; i++) {
-    uint32_t ref = i + 2;
-    if (C[i] != ref) {
-      std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
-      errors++;
-    } else {
-      if (verbosity > 1)
-        std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
-    }
-  }
-  return errors;
-}
+namespace po = boost::program_options;
 
-// ----------------------------------------------------------------------------
-// Main
-// ----------------------------------------------------------------------------
 int main(int argc, const char *argv[]) {
 
-  // ------------------------------------------------------
-  // Parse program arguments
-  // ------------------------------------------------------
+  // Program arguments parsing
   po::options_description desc("Allowed options");
   po::variables_map vm;
   test_utils::add_default_options(desc);
@@ -72,31 +39,20 @@ int main(int argc, const char *argv[]) {
   int n_warmup_iterations = vm["warmup"].as<int>();
   int trace_size = vm["trace_sz"].as<int>();
 
-  // ------------------------------------------------------
-  // Configure this to match your design's buffer size
-  // ------------------------------------------------------
-  int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example
-  int INOUT1_VOLUME = 64; // Not used in this example
-  int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example
-
-  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
-  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
-  size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
-
-  // TODO Remove trace for now?
-  size_t OUT_SIZE = INOUT2_SIZE + trace_size;
+  constexpr bool VERIFY = true;
+  constexpr int IN_VOLUME = 4096;
 
-  srand(time(NULL));
+  constexpr int IN_SIZE = IN_VOLUME * sizeof(DATATYPE);
+  int OUT_SIZE = IN_SIZE + trace_size;
 
   // Load instruction sequence
   std::vector<uint32_t> instr_v =
       test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+
   if (verbosity >= 1)
     std::cout << "Sequence instr count: " << instr_v.size() << "\n";
 
-  // ------------------------------------------------------
-  // Get device, load the xclbin & kernel and register them
-  // ------------------------------------------------------
+  // Start the XRT context and load the kernel
   xrt::device device;
   xrt::kernel kernel;
 
@@ -104,61 +60,41 @@ int main(int argc, const char *argv[]) {
                                    vm["xclbin"].as<std::string>(),
                                    vm["kernel"].as<std::string>());
 
-  // ------------------------------------------------------
-  // Initialize input/ output buffer sizes and sync them
-  // ------------------------------------------------------
+  // set up the buffer objects
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inout0 =
-      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inout1 =
-      xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  // Assumes trace will only be added to inout2
-  auto bo_inout2 =
+  auto bo_inA =
+      xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE),
+                             XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_outC =
       xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-  // auto bo_trace =
-  //     // xrt::bo(device, trace_size, XRT_BO_FLAGS_HOST_ONLY,
-  //     kernel.group_id(4)); xrt::bo(device, trace_size,
-  //     XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects.\n";
 
-  // Initialize instruction buffer
+  // Copy instruction stream to xrt buffer object
   void *bufInstr = bo_instr.map<void *>();
   memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
 
-  // Initialize Inout buffer 0
-  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
-  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
-  for (int i = 0; i < INOUT0_VOLUME; i++)
-    AVec[i] = i + 1;
-  // AVec.push_back(i + 1);
-  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
-
-  // Initialize Inout buffer 1
-  // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT0_DATATYPE *>();
-  // std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
-  // for (int i = 0; i < INOUT1_VOLUME; i++)
-  //   BVec[i] = i + 1
-  //   //BVec.push_back(i + 1);
-  // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
-
-  // Initialize Inout buffer 2
-  char *bufInOut2 = bo_inout2.map<char *>();
-  std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
-  memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
-  // memset(bufInOut2, 0, INOUT2_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
-
-  // char *bufTrace = bo_trace.map<char *>();
-  // memset(bufTrace, 0, trace_size);
-
-  // Sync buffers to update input buffer values
+  // Initialize buffer bo_inA
+  DATATYPE *bufInA = bo_inA.map<DATATYPE *>();
+  for (int i = 0; i < IN_VOLUME; i++)
+    bufInA[i] = i + 1;
+
+  // Initialize buffer bo_inFactor
+  DATATYPE *bufInFactor = bo_inFactor.map<DATATYPE *>();
+  *bufInFactor = scaleFactor;
+
+  // Zero out buffer bo_outC
+  DATATYPE *bufOut = bo_outC.map<DATATYPE *>();
+  memset(bufOut, 0, OUT_SIZE);
+
+  // sync host to device memories
   bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  // bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_outC.sync(XCL_BO_SYNC_BO_TO_DEVICE);
 
   // ------------------------------------------------------
   // Initialize run configs
@@ -179,14 +115,12 @@ int main(int argc, const char *argv[]) {
     if (verbosity >= 1)
       std::cout << "Running Kernel.\n";
     auto start = std::chrono::high_resolution_clock::now();
-    auto run =
-        kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
-    // kernel(bo_instr, instr_v.size(), bo_inout0, bo_trace, bo_inout2);
+    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC);
     run.wait();
-    // sleep(3);
     auto stop = std::chrono::high_resolution_clock::now();
-    bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-    // bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    // Sync device to host memories
+    bo_outC.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
 
     if (iter < n_warmup_iterations) {
       /* Warmup iterations do not count towards average runtime. */
@@ -194,13 +128,26 @@ int main(int argc, const char *argv[]) {
     }
 
     // Copy output results and verify they are correct
-    memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
+    // Results are read directly from the mapped output buffer (no host copy)
     if (do_verify) {
       if (verbosity >= 1) {
         std::cout << "Verifying results ..." << std::endl;
       }
       auto vstart = std::chrono::system_clock::now();
-      errors = verify(INOUT2_VOLUME, CVec, verbosity);
+      for (uint32_t i = 0; i < IN_VOLUME; i++) {
+        int32_t ref = bufInA[i] * scaleFactor;
+        int32_t test = bufOut[i];
+        if (test != ref) {
+          if (verbosity >= 1)
+            std::cout << "Error in output " << test << " != " << ref
+                      << std::endl;
+          errors++;
+        } else {
+          if (verbosity >= 1)
+            std::cout << "Correct output " << test << " == " << ref
+                      << std::endl;
+        }
+      }
       auto vstop = std::chrono::system_clock::now();
       float vtime =
           std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
@@ -215,7 +162,7 @@ int main(int argc, const char *argv[]) {
 
     // Write trace values if trace_size > 0
     if (trace_size > 0) {
-      test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, trace_size,
+      test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size,
                                   vm["trace_file"].as<std::string>());
     }
 
@@ -255,12 +202,15 @@ int main(int argc, const char *argv[]) {
     std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
               << std::endl;
 
+  // Print Pass/Fail result of our test
   if (!errors) {
-    std::cout << "\nPASS!\n\n";
+    std::cout << std::endl << "PASS!" << std::endl << std::endl;
     return 0;
   } else {
-    std::cout << "\nError count: " << errors << "\n\n";
-    std::cout << "\nFailed.\n\n";
+    std::cout << std::endl
+              << errors << " mismatches." << std::endl
+              << std::endl;
+    std::cout << std::endl << "fail." << std::endl << std::endl;
     return 1;
   }
 }
diff --git a/programming_guide/section-4/section-4b/test.py b/programming_guide/section-4/section-4b/test.py
index a36dc5d5a7..e7f6628ba6 100644
--- a/programming_guide/section-4/section-4b/test.py
+++ b/programming_guide/section-4/section-4b/test.py
@@ -3,30 +3,20 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-# import argparse
 import numpy as np
 import pyxrt as xrt
 import sys
 import time
 
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext import memref, arith
+
 import aie.utils.test as test_utils
 import aie.utils.trace as trace_utils
 
-# ------------------------------------------------------
-# Configure this to match your design's buffer size
-# ------------------------------------------------------
-INOUT0_VOLUME = 64  # Input only, 64x uint32_t in this example
-INOUT1_VOLUME = 64  # Not used in this example
-INOUT2_VOLUME = 64  # Output only, 64x uint32_t in this example
-
-INOUT0_DATATYPE = np.uint32
-INOUT1_DATATYPE = np.uint32
-INOUT2_DATATYPE = np.uint32
-
-INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
-INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
-INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
-
 
 def main(opts):
 
@@ -36,6 +26,21 @@ def main(opts):
         instr_text = [l for l in instr_text if l != ""]
         instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
 
+    # ------------------------------------------------------------
+    # Configure this to match your design's buffer size and type
+    # ------------------------------------------------------------
+    INOUT0_VOLUME = int(4096)  # Input only, 4096x int32 in this example
+    INOUT1_VOLUME = int(1)  # Input only, 1x int32 scale factor
+    INOUT2_VOLUME = int(4096)  # Output only, 4096x int32 in this example
+
+    INOUT0_DATATYPE = np.int32
+    INOUT1_DATATYPE = np.int32
+    INOUT2_DATATYPE = np.int32
+
+    INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
+    INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
+    INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
+
     OUT_SIZE = INOUT2_SIZE + int(opts.trace_size)
 
     # ------------------------------------------------------
@@ -49,7 +54,6 @@ def main(opts):
     bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
     bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
     bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
-    # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4))
     bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4))
 
     # Initialize instruction buffer
@@ -57,10 +61,10 @@ def main(opts):
 
     # Initialize data buffers
     inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
-    inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE)
-    inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE)
+    scale_factor = np.array([3], dtype=INOUT1_DATATYPE)
+    inout2 = np.zeros(OUT_SIZE, dtype=np.uint8)
     bo_inout0.write(inout0, 0)
-    bo_inout1.write(inout1, 0)
+    bo_inout1.write(scale_factor, 0)
     bo_inout2.write(inout2, 0)
 
     # Sync buffers to update input buffer values
@@ -101,9 +105,8 @@ def main(opts):
         if opts.verify:
             if opts.verbosity >= 1:
                 print("Verifying results ...")
-            ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE)
+            ref = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) * scale_factor
             e = np.equal(output_buffer, ref)
-            # e = np.equal(dput_buffer, ref)
             errors = errors + np.size(e) - np.count_nonzero(e)
 
         # Write trace values if trace_size > 0
@@ -136,5 +139,6 @@ def main(opts):
 
 
 if __name__ == "__main__":
-    opts = test_utils.parse_args(sys.argv[1:])
+    p = test_utils.create_default_argparser()
+    opts = p.parse_args(sys.argv[1:])
     main(opts)
diff --git a/programming_guide/section-4/section-4b/vector_scalar_mul.cc b/programming_guide/section-4/section-4b/vector_scalar_mul.cc
new file mode 100755
index 0000000000..b47fc34622
--- /dev/null
+++ b/programming_guide/section-4/section-4b/vector_scalar_mul.cc
@@ -0,0 +1,26 @@
+//===- vector_scalar_mul.cc -------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+extern "C" {
+
+void vector_scalar_mul_aie_scalar(int32_t *a, int32_t *c, int32_t *factor,
+                                  int32_t N) {
+  event0();
+  for (int i = 0; i < N; i++) {
+    c[i] = *factor * a[i];
+  }
+  event1();
+}
+} // extern "C"
diff --git a/programming_guide/section-4/section-4c/README.md b/programming_guide/section-4/section-4c/README.md
index d9109bc4e5..81146f42e7 100644
--- a/programming_guide/section-4/section-4c/README.md
+++ b/programming_guide/section-4/section-4c/README.md
@@ -17,7 +17,7 @@
 
 -----
 
-Now that we are able to measure the total application time ([section-4a](../section-4a/)) and have examined the kernel performance via tracing ([section-4b](../section-4b)), we will take a closer look at kernel vectorization. We will be using the [vector-scalar multiply example](../../../programming_examples/basic/vector_scalar_mul/) to illustrate kernel vectorization concepts.
+Now that we are able to measure the total application time ([section-4a](../section-4a/)) and have examined the kernel performance via tracing ([section-4b](../section-4b)), we will take a closer look at kernel vectorization. We will be using the [vector-scalar multiply example](../../../programming_examples/basic/vector_scalar_mul/), rather than a local copy of that same design, to illustrate kernel vectorization concepts. Note that, by default, that example design works with 16-bit data (versus the 32-bit data used in our local examples) and has `vectorized=True`.
 
 Go ahead and read the design example summary for [vector-scalar multiply](../../../programming_examples/basic/vector_scalar_mul/) first to get an idea of the different components of this example design. Then, let's take a closer look at the kernel source file ([scale.cc](../../../aie_kernels/aie2/scale.cc)).
 
diff --git a/programming_guide/section-4/test.cpp b/programming_guide/section-4/test.cpp
deleted file mode 100644
index 2ec8a0d1c3..0000000000
--- a/programming_guide/section-4/test.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#include <boost/program_options.hpp>
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-#include "test_utils.h"
-
-#ifndef DATATYPES_USING_DEFINED
-#define DATATYPES_USING_DEFINED
-// ------------------------------------------------------
-// Configure this to match your buffer data type
-// ------------------------------------------------------
-using INOUT0_DATATYPE = std::uint32_t;
-using INOUT1_DATATYPE = std::uint32_t;
-using INOUT2_DATATYPE = std::uint32_t;
-#endif
-
-namespace po = boost::program_options;
-
-// ----------------------------------------------------------------------------
-// Verify results (specific to our design example)
-// ----------------------------------------------------------------------------
-template <typename Tout>
-int verify(int CSize, std::vector<Tout> C, int verbosity) {
-  int errors = 0;
-  for (uint32_t i = 0; i < CSize; i++) {
-    uint32_t ref = i + 2;
-    if (C[i] != ref) {
-      std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
-      errors++;
-    } else {
-      if (verbosity > 1)
-        std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
-    }
-  }
-  return errors;
-}
-
-// ----------------------------------------------------------------------------
-// Main
-// ----------------------------------------------------------------------------
-int main(int argc, const char *argv[]) {
-
-  // ------------------------------------------------------
-  // Parse program arguments
-  // ------------------------------------------------------
-  po::options_description desc("Allowed options");
-  po::variables_map vm;
-  test_utils::add_default_options(desc);
-
-  test_utils::parse_options(argc, argv, desc, vm);
-  int verbosity = vm["verbosity"].as<int>();
-  int do_verify = vm["verify"].as<bool>();
-  int n_iterations = vm["iters"].as<int>();
-  int n_warmup_iterations = vm["warmup"].as<int>();
-  int trace_size = vm["trace_sz"].as<int>();
-
-  // ------------------------------------------------------
-  // Configure this to match your design's buffer size
-  // ------------------------------------------------------
-  int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example
-  int INOUT1_VOLUME = 64; // Not used in this example
-  int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example
-
-  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
-  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
-  size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
-
-  // TODO Remove trace for now?
-  size_t OUT_SIZE = INOUT2_SIZE + trace_size;
-
-  srand(time(NULL));
-
-  // Load instruction sequence
-  std::vector<uint32_t> instr_v =
-      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
-  if (verbosity >= 1)
-    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
-
-  // ------------------------------------------------------
-  // Get device, load the xclbin & kernel and register them
-  // ------------------------------------------------------
-  xrt::device device;
-  xrt::kernel kernel;
-
-  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
-                                   vm["xclbin"].as<std::string>(),
-                                   vm["kernel"].as<std::string>());
-
-  // ------------------------------------------------------
-  // Initialize input/ output buffer sizes and sync them
-  // ------------------------------------------------------
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inout0 =
-      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inout1 =
-      xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  // Assumes trace will only be added to inout2
-  auto bo_inout2 =
-      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-
-  if (verbosity >= 1)
-    std::cout << "Writing data into buffer objects.\n";
-
-  // Initialize instruction buffer
-  void *bufInstr = bo_instr.map<void *>();
-  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  // Initialize Inout buffer 0
-  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
-  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
-  for (int i = 0; i < INOUT0_VOLUME; i++)
-    AVec[i] = i + 1;
-  // AVec.push_back(i + 1);
-  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
-
-  // Initialize Inout buffer 1
-  // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT0_DATATYPE *>();
-  // std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
-  // for (int i = 0; i < INOUT1_VOLUME; i++)
-  //   BVec[i] = i + 1
-  //   //BVec.push_back(i + 1);
-  // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
-
-  // Initialize Inout buffer 2
-  char *bufInOut2 = bo_inout2.map<char *>();
-  std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
-  memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
-
-  // Sync buffers to update input buffer values
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  // ------------------------------------------------------
-  // Initialize run configs
-  // ------------------------------------------------------
-  unsigned num_iter = n_iterations + n_warmup_iterations;
-  float npu_time_total = 0;
-  float npu_time_min = 9999999;
-  float npu_time_max = 0;
-
-  int errors = 0;
-
-  // ------------------------------------------------------
-  // Main run loop
-  // ------------------------------------------------------
-  for (unsigned iter = 0; iter < num_iter; iter++) {
-
-    // Run kernel
-    if (verbosity >= 1)
-      std::cout << "Running Kernel.\n";
-    auto start = std::chrono::high_resolution_clock::now();
-    auto run =
-        kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
-    run.wait();
-    auto stop = std::chrono::high_resolution_clock::now();
-    bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-    if (iter < n_warmup_iterations) {
-      /* Warmup iterations do not count towards average runtime. */
-      continue;
-    }
-
-    // Copy output results and verify they are correct
-    memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
-    if (do_verify) {
-      if (verbosity >= 1) {
-        std::cout << "Verifying results ..." << std::endl;
-      }
-      auto vstart = std::chrono::system_clock::now();
-      errors = verify(INOUT2_VOLUME, CVec, verbosity);
-      auto vstop = std::chrono::system_clock::now();
-      float vtime =
-          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
-              .count();
-      if (verbosity >= 1) {
-        std::cout << "Verify time: " << vtime << "secs." << std::endl;
-      }
-    } else {
-      if (verbosity >= 1)
-        std::cout << "WARNING: results not verified." << std::endl;
-    }
-
-    // Write trace values if trace_size > 0
-    if (trace_size > 0) {
-      // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE,
-      // trace_size,
-      test_utils::write_out_trace(((char *)bufInOut2), trace_size,
-                                  vm["trace_file"].as<std::string>());
-    }
-
-    // Accumulate run times
-    float npu_time =
-        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-            .count();
-
-    npu_time_total += npu_time;
-    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
-    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
-  }
-
-  // ------------------------------------------------------
-  // Print verification and timing results
-  // ------------------------------------------------------
-
-  // TODO - Mac count to guide gflops
-  float macs = 0;
-
-  std::cout << std::endl
-            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
-            << std::endl;
-  if (macs > 0)
-    std::cout << "Avg NPU gflops: "
-              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
-
-  std::cout << std::endl
-            << "Min NPU time: " << npu_time_min << "us." << std::endl;
-  if (macs > 0)
-    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
-              << std::endl;
-
-  std::cout << std::endl
-            << "Max NPU time: " << npu_time_max << "us." << std::endl;
-  if (macs > 0)
-    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
-              << std::endl;
-
-  if (!errors) {
-    std::cout << "\nPASS!\n\n";
-    return 0;
-  } else {
-    std::cout << "\nError count: " << errors << "\n\n";
-    std::cout << "\nFailed.\n\n";
-    return 1;
-  }
-}
diff --git a/programming_guide/section-4/test.py b/programming_guide/section-4/test.py
deleted file mode 100644
index 0e82d741cb..0000000000
--- a/programming_guide/section-4/test.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# test.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-import numpy as np
-import pyxrt as xrt
-import sys
-import time
-
-import aie.utils.test as test_utils
-
-# ------------------------------------------------------
-# Configure this to match your design's buffer size
-# ------------------------------------------------------
-INOUT0_VOLUME = 64  # Input only, 64x uint32_t in this example
-INOUT1_VOLUME = 64  # Not used in this example
-INOUT2_VOLUME = 64  # Output only, 64x uint32_t in this example
-
-INOUT0_DATATYPE = np.uint32
-INOUT1_DATATYPE = np.uint32
-INOUT2_DATATYPE = np.uint32
-
-INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
-INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
-INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
-
-
-def main(opts):
-
-    # Load instruction sequence
-    with open(opts.instr, "r") as f:
-        instr_text = f.read().split("\n")
-        instr_text = [l for l in instr_text if l != ""]
-        instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
-
-    OUT_SIZE = INOUT2_SIZE
-
-    # ------------------------------------------------------
-    # Get device, load the xclbin & kernel and register them
-    # ------------------------------------------------------
-    (device, kernel) = test_utils.init_xrt_load_kernel(opts)
-
-    # ------------------------------------------------------
-    # Initialize input/ output buffer sizes and sync them
-    # ------------------------------------------------------
-    bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
-    bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
-    bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
-    # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4))
-    bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4))
-
-    # Initialize instruction buffer
-    bo_instr.write(instr_v, 0)
-
-    # Initialize data buffers
-    inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
-    inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE)
-    inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE)
-    bo_inout0.write(inout0, 0)
-    bo_inout1.write(inout1, 0)
-    bo_inout2.write(inout2, 0)
-
-    # Sync buffers to update input buffer values
-    bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-    bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-    bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-    bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-
-    # ------------------------------------------------------
-    # Initialize run configs
-    # ------------------------------------------------------
-    num_iter = opts.iters + opts.warmup_iters
-    npu_time_total = 0
-    npu_time_min = 9999999
-    npu_time_max = 0
-    errors = 0
-
-    # ------------------------------------------------------
-    # Main run loop
-    # ------------------------------------------------------
-    for i in range(num_iter):
-        # Run kernel
-        if opts.verbosity >= 1:
-            print("Running Kernel.")
-        start = time.time_ns()
-        h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2)
-        h.wait()
-        stop = time.time_ns()
-        bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
-
-        # Warmup iterations do not count towards average runtime.
-        if i < opts.warmup_iters:
-            continue
-
-        # Copy output results and verify they are correct
-        out_size = INOUT2_SIZE
-        output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE)
-        if opts.verify:
-            if opts.verbosity >= 1:
-                print("Verifying results ...")
-            ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE)
-            e = np.equal(output_buffer, ref)
-            errors = errors + np.size(e) - np.count_nonzero(e)
-
-        npu_time = stop - start
-        npu_time_total = npu_time_total + npu_time
-        npu_time_min = min(npu_time_min, npu_time)
-        npu_time_max = max(npu_time_max, npu_time)
-
-    # ------------------------------------------------------
-    # Print verification and timing results
-    # ------------------------------------------------------
-
-    # TODO - Mac count to guide gflops
-
-    print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000)))
-    print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000)))
-    print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000)))
-
-    if not errors:
-        print("\nPASS!\n")
-        exit(0)
-    else:
-        print("\nError count: ", errors)
-        print("\nFailed.\n")
-        exit(-1)
-
-
-if __name__ == "__main__":
-    opts = test_utils.parse_args(sys.argv[1:])
-    main(opts)
diff --git a/programming_guide/section-4/test_trace.cpp b/programming_guide/section-4/test_trace.cpp
deleted file mode 100644
index 2ec8a0d1c3..0000000000
--- a/programming_guide/section-4/test_trace.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#include <boost/program_options.hpp>
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-#include "test_utils.h"
-
-#ifndef DATATYPES_USING_DEFINED
-#define DATATYPES_USING_DEFINED
-// ------------------------------------------------------
-// Configure this to match your buffer data type
-// ------------------------------------------------------
-using INOUT0_DATATYPE = std::uint32_t;
-using INOUT1_DATATYPE = std::uint32_t;
-using INOUT2_DATATYPE = std::uint32_t;
-#endif
-
-namespace po = boost::program_options;
-
-// ----------------------------------------------------------------------------
-// Verify results (specific to our design example)
-// ----------------------------------------------------------------------------
-template <typename Tout>
-int verify(int CSize, std::vector<Tout> C, int verbosity) {
-  int errors = 0;
-  for (uint32_t i = 0; i < CSize; i++) {
-    uint32_t ref = i + 2;
-    if (C[i] != ref) {
-      std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
-      errors++;
-    } else {
-      if (verbosity > 1)
-        std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
-    }
-  }
-  return errors;
-}
-
-// ----------------------------------------------------------------------------
-// Main
-// ----------------------------------------------------------------------------
-int main(int argc, const char *argv[]) {
-
-  // ------------------------------------------------------
-  // Parse program arguments
-  // ------------------------------------------------------
-  po::options_description desc("Allowed options");
-  po::variables_map vm;
-  test_utils::add_default_options(desc);
-
-  test_utils::parse_options(argc, argv, desc, vm);
-  int verbosity = vm["verbosity"].as<int>();
-  int do_verify = vm["verify"].as<bool>();
-  int n_iterations = vm["iters"].as<int>();
-  int n_warmup_iterations = vm["warmup"].as<int>();
-  int trace_size = vm["trace_sz"].as<int>();
-
-  // ------------------------------------------------------
-  // Configure this to match your design's buffer size
-  // ------------------------------------------------------
-  int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example
-  int INOUT1_VOLUME = 64; // Not used in this example
-  int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example
-
-  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
-  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
-  size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
-
-  // TODO Remove trace for now?
-  size_t OUT_SIZE = INOUT2_SIZE + trace_size;
-
-  srand(time(NULL));
-
-  // Load instruction sequence
-  std::vector<uint32_t> instr_v =
-      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
-  if (verbosity >= 1)
-    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
-
-  // ------------------------------------------------------
-  // Get device, load the xclbin & kernel and register them
-  // ------------------------------------------------------
-  xrt::device device;
-  xrt::kernel kernel;
-
-  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
-                                   vm["xclbin"].as<std::string>(),
-                                   vm["kernel"].as<std::string>());
-
-  // ------------------------------------------------------
-  // Initialize input/ output buffer sizes and sync them
-  // ------------------------------------------------------
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inout0 =
-      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inout1 =
-      xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  // Assumes trace will only be added to inout2
-  auto bo_inout2 =
-      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-
-  if (verbosity >= 1)
-    std::cout << "Writing data into buffer objects.\n";
-
-  // Initialize instruction buffer
-  void *bufInstr = bo_instr.map<void *>();
-  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  // Initialize Inout buffer 0
-  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
-  std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
-  for (int i = 0; i < INOUT0_VOLUME; i++)
-    AVec[i] = i + 1;
-  // AVec.push_back(i + 1);
-  memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
-
-  // Initialize Inout buffer 1
-  // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT0_DATATYPE *>();
-  // std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
-  // for (int i = 0; i < INOUT1_VOLUME; i++)
-  //   BVec[i] = i + 1
-  //   //BVec.push_back(i + 1);
-  // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
-
-  // Initialize Inout buffer 2
-  char *bufInOut2 = bo_inout2.map<char *>();
-  std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
-  memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
-
-  // Sync buffers to update input buffer values
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  // ------------------------------------------------------
-  // Initialize run configs
-  // ------------------------------------------------------
-  unsigned num_iter = n_iterations + n_warmup_iterations;
-  float npu_time_total = 0;
-  float npu_time_min = 9999999;
-  float npu_time_max = 0;
-
-  int errors = 0;
-
-  // ------------------------------------------------------
-  // Main run loop
-  // ------------------------------------------------------
-  for (unsigned iter = 0; iter < num_iter; iter++) {
-
-    // Run kernel
-    if (verbosity >= 1)
-      std::cout << "Running Kernel.\n";
-    auto start = std::chrono::high_resolution_clock::now();
-    auto run =
-        kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
-    run.wait();
-    auto stop = std::chrono::high_resolution_clock::now();
-    bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-    if (iter < n_warmup_iterations) {
-      /* Warmup iterations do not count towards average runtime. */
-      continue;
-    }
-
-    // Copy output results and verify they are correct
-    memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
-    if (do_verify) {
-      if (verbosity >= 1) {
-        std::cout << "Verifying results ..." << std::endl;
-      }
-      auto vstart = std::chrono::system_clock::now();
-      errors = verify(INOUT2_VOLUME, CVec, verbosity);
-      auto vstop = std::chrono::system_clock::now();
-      float vtime =
-          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
-              .count();
-      if (verbosity >= 1) {
-        std::cout << "Verify time: " << vtime << "secs." << std::endl;
-      }
-    } else {
-      if (verbosity >= 1)
-        std::cout << "WARNING: results not verified." << std::endl;
-    }
-
-    // Write trace values if trace_size > 0
-    if (trace_size > 0) {
-      // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE,
-      // trace_size,
-      test_utils::write_out_trace(((char *)bufInOut2), trace_size,
-                                  vm["trace_file"].as<std::string>());
-    }
-
-    // Accumulate run times
-    float npu_time =
-        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-            .count();
-
-    npu_time_total += npu_time;
-    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
-    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
-  }
-
-  // ------------------------------------------------------
-  // Print verification and timing results
-  // ------------------------------------------------------
-
-  // TODO - Mac count to guide gflops
-  float macs = 0;
-
-  std::cout << std::endl
-            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
-            << std::endl;
-  if (macs > 0)
-    std::cout << "Avg NPU gflops: "
-              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
-
-  std::cout << std::endl
-            << "Min NPU time: " << npu_time_min << "us." << std::endl;
-  if (macs > 0)
-    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
-              << std::endl;
-
-  std::cout << std::endl
-            << "Max NPU time: " << npu_time_max << "us." << std::endl;
-  if (macs > 0)
-    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
-              << std::endl;
-
-  if (!errors) {
-    std::cout << "\nPASS!\n\n";
-    return 0;
-  } else {
-    std::cout << "\nError count: " << errors << "\n\n";
-    std::cout << "\nFailed.\n\n";
-    return 1;
-  }
-}
diff --git a/programming_guide/section-4/test_trace.py b/programming_guide/section-4/test_trace.py
deleted file mode 100644
index b6c0d99c02..0000000000
--- a/programming_guide/section-4/test_trace.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# test.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-# import argparse
-import numpy as np
-import pyxrt as xrt
-import sys
-import time
-
-import aie.utils.test as test_utils
-
-# ------------------------------------------------------
-# Configure this to match your design's buffer size
-# ------------------------------------------------------
-INOUT0_VOLUME = 64  # Input only, 64x uint32_t in this example
-INOUT1_VOLUME = 64  # Not used in this example
-INOUT2_VOLUME = 64  # Output only, 64x uint32_t in this example
-
-INOUT0_DATATYPE = np.uint32
-INOUT1_DATATYPE = np.uint32
-INOUT2_DATATYPE = np.uint32
-
-INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
-INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
-INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
-
-
-def main(opts):
-
-    # Load instruction sequence
-    with open(opts.instr, "r") as f:
-        instr_text = f.read().split("\n")
-        instr_text = [l for l in instr_text if l != ""]
-        instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
-
-    OUT_SIZE = INOUT2_SIZE + opts.trace_size
-
-    # ------------------------------------------------------
-    # Get device, load the xclbin & kernel and register them
-    # ------------------------------------------------------
-    (device, kernel) = init_xrt_load_kernel(opts)
-
-    # ------------------------------------------------------
-    # Initialize input/ output buffer sizes and sync them
-    # ------------------------------------------------------
-    bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
-    bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
-    bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
-    # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4))
-    bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4))
-
-    # Initialize instruction buffer
-    bo_instr.write(instr_v, 0)
-
-    # Initialize data buffers
-    inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
-    inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE)
-    inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE)
-    bo_inout0.write(inout0, 0)
-    bo_inout1.write(inout1, 0)
-    bo_inout2.write(inout2, 0)
-
-    # Sync buffers to update input buffer values
-    bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-    bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-    bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-    bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-
-    # ------------------------------------------------------
-    # Initialize run configs
-    # ------------------------------------------------------
-    num_iter = opts.iters + opts.warmup_iters
-    npu_time_total = 0
-    npu_time_min = 9999999
-    npu_time_max = 0
-    errors = 0
-
-    # ------------------------------------------------------
-    # Main run loop
-    # ------------------------------------------------------
-    for i in range(num_iter):
-        # Run kernel
-        if opts.verbosity >= 1:
-            print("Running Kernel.")
-        start = time.time_ns()
-        h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2)
-        h.wait()
-        stop = time.time_ns()
-        bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
-
-        # Warmup iterations do not count towards average runtime.
-        if i < opts.warmup_iters:
-            continue
-
-        # Copy output results and verify they are correct
-        out_size = INOUT2_SIZE + opts.trace_size
-        print("out_size:", out_size)
-        output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE)
-        dout_buffer = output_buffer[0 : INOUT2_VOLUME - 1]
-        trace_buffer = output_buffer[INOUT2_VOLUME - 1 :]
-        if opts.verify:
-            if opts.verbosity >= 1:
-                print("Verifying results ...")
-            ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE)
-            # e = np.equal(output_buffer, ref)
-            e = np.equal(dput_buffer, ref)
-            errors = errors + np.size(e) - np.count_nonzero(e)
-
-        # Write trace values if trace_size > 0
-        # if opts.trace_size > 0:
-        #     print("Do something with trace!")
-        #     test_utils.write_out_trace(trace_buffer, opts.trace_size, opts.trace_file)
-
-        npu_time = stop - start
-        npu_time_total = npu_time_total + npu_time
-        npu_time_min = min(npu_time_min, npu_time)
-        npu_time_max = max(npu_time_max, npu_time)
-
-    # ------------------------------------------------------
-    # Print verification and timing results
-    # ------------------------------------------------------
-
-    # TODO - Mac count to guide gflops
-
-    print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000)))
-    print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000)))
-    print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000)))
-
-    if not errors:
-        print("\nPASS!\n")
-        exit(0)
-    else:
-        print("\nError count: ", errors)
-        print("\nFailed.\n")
-        exit(-1)
-
-
-if __name__ == "__main__":
-    opts = test_utils.parse_args(sys.argv[1:])
-    main(opts)