From 2b24c89f7889994243b429f9bb0c1a085d99fd1f Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 14:34:34 -0600
Subject: [PATCH 01/31] Introduce XRTRunner

---
 programming_examples/passthrough/common.py    | 51 -----------
 .../passthrough/passthrough_dma/Makefile      |  7 +-
 .../passthrough_dma/passthrough_dma.py        | 71 ++++++++++-----
 .../passthrough/passthrough_dma/run.py        | 43 ---------
 python/air/backend/xrt_runner.py              | 87 +++++++++++++++++++
 5 files changed, 143 insertions(+), 116 deletions(-)
 delete mode 100644 programming_examples/passthrough/common.py
 delete mode 100644 programming_examples/passthrough/passthrough_dma/run.py
 create mode 100644 python/air/backend/xrt_runner.py

diff --git a/programming_examples/passthrough/common.py b/programming_examples/passthrough/common.py
deleted file mode 100644
index 8acd7b6a9..000000000
--- a/programming_examples/passthrough/common.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: MIT
-
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-NUM_VECTORS = 4
-
-INOUT_DATATYPE = np.uint8
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-
-
-def test_main(build_module, vector_size, verbose=False, experimental_passes=False):
-    mlir_module = build_module(vector_size)
-
-    input_a = np.arange(1, vector_size + 1, dtype=INOUT_DATATYPE)
-    output_b = np.arange(1, vector_size + 1, dtype=INOUT_DATATYPE)
-    for i in range(vector_size):
-        input_a[i] = i % 0xFF
-        output_b[i] = 0xFF
-
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose,
-        experimental_passes=experimental_passes,
-        omit_while_true_loop=True,
-    )
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        copy = backend.compile_and_load(mlir_module)
-        (_, output_b) = copy(input_a, output_b)
-
-    backend.unload()
-
-    # check output, should have the top left filled in
-    errors = 0
-    for i in range(vector_size):
-        rb = output_b[i]
-
-        expected_value = i % 0xFF
-        if rb != expected_value:
-            print(f"IM {i} should be 0x{expected_value:x}, is 0x{rb:x}\n")
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
diff --git a/programming_examples/passthrough/passthrough_dma/Makefile b/programming_examples/passthrough/passthrough_dma/Makefile
index 122cb0624..e91fad2f8 100644
--- a/programming_examples/passthrough/passthrough_dma/Makefile
+++ b/programming_examples/passthrough/passthrough_dma/Makefile
@@ -6,9 +6,12 @@ targetname := $(shell basename ${srcdir})
 
 all: run
 
+print:
+	${powershell} python3 ${srcdir}/passthrough_dma.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/passthrough_dma.py
 
 clean:
 	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
index b3abfc3bc..e14cb97dc 100644
--- a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
+++ b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
@@ -1,36 +1,27 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
-from common import *
+INOUT_DATATYPE = np.uint8
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
 
 
 @module_builder
-def build_module(vector_size):
-    assert vector_size % NUM_VECTORS == 0
+def build_module(vector_size, num_subvectors):
+    assert vector_size % num_subvectors == 0
 
     # chop input in 4 sub-tensors
-    lineWidthInBytes = vector_size // NUM_VECTORS
+    lineWidthInBytes = vector_size // num_subvectors
 
     # Type and method of input/output
     memrefTyInOut = T.memref(vector_size, T.ui8())
@@ -62,7 +53,7 @@ def segment_body(arg0, arg1):
                 @herd(name="copyherd", sizes=[1, 1], operands=[arg0, arg1])
                 def herd_body(tx, ty, sx, sy, a, b):
 
-                    for _i in range_(NUM_VECTORS):
+                    for _i in range_(num_subvectors):
                         # We must allocate a buffer of image size for the input/output
                         tensor_in = AllocOp(tensor_type, [], [])
                         tensor_out = AllocOp(tensor_type, [], [])
@@ -87,5 +78,45 @@ def herd_body(tx, ty, sx, sy, a, b):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="passthrough_dma.py",
+        description="Builds, runs, and tests the passthrough_dma example",
+    )
+    parser.add_argument(
+        "-s",
+        "--vector_size",
+        type=int,
+        default=4096,
+        help="The size (in bytes) of the data vector to passthrough",
+    )
+    parser.add_argument(
+        "--subvector_size",
+        type=int,
+        default=4,
+        help="The number of sub-vectors to break the vector into",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(args.vector_size, args.subvector_size)
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    output_b = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    for i in range(args.vector_size):
+        input_a[i] = i % 0xFF
+        output_b[i] = i % 0xFF
+
+    runner = XRTRunner(verbose=args.verbose)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/passthrough/passthrough_dma/run.py b/programming_examples/passthrough/passthrough_dma/run.py
deleted file mode 100644
index ce11d8a59..000000000
--- a/programming_examples/passthrough/passthrough_dma/run.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from common import test_main
-from passthrough_dma.passthrough_dma import build_module
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough/passthrough_dma example",
-    )
-    parser.add_argument(
-        "-s",
-        "--vector_size",
-        type=int,
-        default=4096,
-        help="The size (in bytes) of the data vector to passthrough",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(
-        build_module, args.vector_size, experimental_passes=True, verbose=args.verbose
-    )
diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
new file mode 100644
index 000000000..2124029fb
--- /dev/null
+++ b/python/air/backend/xrt_runner.py
@@ -0,0 +1,87 @@
+# ./python/air/backend/xrt_runner.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+from .xrt import XRTBackend
+import filelock
+from typing import List
+
+
+class XRTRunner:
+    def __init__(
+        self,
+        verbose: bool = False,
+        experimental_passes: bool = True,
+        omit_while_true_loop: bool = True,
+    ):
+        self.verbose = verbose
+        self.experimental_passes = experimental_passes
+        self.omit_while_true_loop = omit_while_true_loop
+
+    def run_test(
+        self,
+        mlir_module: np.ndarray,
+        inputs: List[np.ndarray],
+        expected_outputs: List[np.ndarray],
+    ):
+        if self.verbose:
+            print("Running module: ")
+            print(mlir_module)
+
+        backend = XRTBackend(
+            verbose=self.verbose,
+            experimental_passes=self.experimental_passes,
+            omit_while_true_loop=self.omit_while_true_loop,
+        )
+
+        # run the module - slots are input/output for now, assume non-overlapping inputs/outputs
+        inputs += [np.empty((0, 0)) for _o in expected_outputs]
+        with filelock.FileLock("/tmp/npu.lock"):
+            module_function = backend.compile_and_load(mlir_module)
+            actual_outputs = module_function(*inputs)
+
+        backend.unload()
+
+        # Remove input slots from the received outputs
+        actual_outputs = actual_outputs[len(inputs) :]
+
+        if self._check_outputs(actual_outputs, expected_outputs):
+            print("PASS!")
+            return_code = 0
+        else:
+            print("failed.")
+            return_code = -1
+        return return_code
+
+    def _check_outputs(
+        self, actual_outputs: List[np.ndarray], expected_outputs: List[np.ndarray]
+    ):
+        assert (
+            len(actual_outputs) == len(expected_outputs),
+            "Number of actual outputs does not equal number of expected outputs",
+        )
+
+        for i, (actual, expected) in enumerate(zip(actual_outputs, expected_outputs)):
+
+            # TODO: may need to reshape??
+            assert (
+                actual.size() == expected.size(),
+                f"Actual output size {actual.size()} does not meet expected output size {expected.size()}",
+            )
+
+            if not np.ndarray.array_equal(actual, expected):
+                print(f"ERROR: Output {i} does not meet expected output.")
+                print("Expected: ")
+                if len(expected.size()) == 2:
+                    print(np.asmatrix(expected))
+                else:
+                    print(expected)
+                print("Actual: ")
+                if len(actual.size()) == 2:
+                    print(np.asmatrix(actual))
+                else:
+                    print(actual)
+                return False
+        return True

From 50e020a1a3f76fb6dd858e7438a19fab22460ec9 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 14:50:15 -0600
Subject: [PATCH 02/31] Fixup a few bugs

---
 python/air/backend/xrt_runner.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index 2124029fb..55ed35a68 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -37,10 +37,10 @@ def run_test(
         )
 
         # run the module - slots are input/output for now, assume non-overlapping inputs/outputs
-        inputs += [np.empty((0, 0)) for _o in expected_outputs]
+        expanded_inputs = inputs + expected_outputs
         with filelock.FileLock("/tmp/npu.lock"):
             module_function = backend.compile_and_load(mlir_module)
-            actual_outputs = module_function(*inputs)
+            actual_outputs = module_function(*expanded_inputs)
 
         backend.unload()
 
@@ -58,30 +58,30 @@ def run_test(
     def _check_outputs(
         self, actual_outputs: List[np.ndarray], expected_outputs: List[np.ndarray]
     ):
-        assert (
-            len(actual_outputs) == len(expected_outputs),
-            "Number of actual outputs does not equal number of expected outputs",
-        )
+        assert len(actual_outputs) == len(
+            expected_outputs
+        ), f"Number of actual outputs ({len(actual_outputs)}) does not equal number of expected outputs ({len(expected_outputs)})"
 
         for i, (actual, expected) in enumerate(zip(actual_outputs, expected_outputs)):
 
-            # TODO: may need to reshape??
+            # TODO: may need to reshape for this to be true?
             assert (
-                actual.size() == expected.size(),
-                f"Actual output size {actual.size()} does not meet expected output size {expected.size()}",
-            )
+                actual.shape == expected.shape
+            ), f"Actual output shape {actual.shape} does not meet expected output shape {expected.shape}"
 
-            if not np.ndarray.array_equal(actual, expected):
-                print(f"ERROR: Output {i} does not meet expected output.")
+            if self.verbose:
                 print("Expected: ")
-                if len(expected.size()) == 2:
+                if len(expected.shape) == 2:
                     print(np.asmatrix(expected))
                 else:
                     print(expected)
                 print("Actual: ")
-                if len(actual.size()) == 2:
+                if len(actual.shape) == 2:
                     print(np.asmatrix(actual))
                 else:
                     print(actual)
+
+            if not np.array_equal(actual, expected):
+                print(f"ERROR: Output {i} does not meet expected output.")
                 return False
         return True

From 155fc27e0ff68eb025419976e6e54d1e735fc41f Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 14:53:40 -0600
Subject: [PATCH 03/31] migrate passthrough_channel to xrtrunner

---
 .../passthrough/passthrough_channel/Makefile  |  7 +-
 .../passthrough_channel.py                    | 71 +++++++++++++------
 .../passthrough/passthrough_channel/run.py    | 43 -----------
 3 files changed, 56 insertions(+), 65 deletions(-)
 delete mode 100644 programming_examples/passthrough/passthrough_channel/run.py

diff --git a/programming_examples/passthrough/passthrough_channel/Makefile b/programming_examples/passthrough/passthrough_channel/Makefile
index 122cb0624..c5d3d82dc 100644
--- a/programming_examples/passthrough/passthrough_channel/Makefile
+++ b/programming_examples/passthrough/passthrough_channel/Makefile
@@ -6,9 +6,12 @@ targetname := $(shell basename ${srcdir})
 
 all: run
 
+print:
+	${powershell} python3 ${srcdir}/passthrough_channel.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/passthrough_channel.py
 
 clean:
 	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
index 503dbf36f..93c7265bf 100644
--- a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
+++ b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
@@ -1,36 +1,27 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
-from common import *
+INOUT_DATATYPE = np.uint8
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
 
 
 @module_builder
-def build_module(vector_size):
-    assert vector_size % NUM_VECTORS == 0
+def build_module(vector_size, num_subvectors):
+    assert vector_size % num_subvectors == 0
 
     # chop input in 4 sub-tensors
-    lineWidthInBytes = vector_size // NUM_VECTORS
+    lineWidthInBytes = vector_size // num_subvectors
 
     # Type and method of input/output
     memrefTyInOut = T.memref(vector_size, T.ui8())
@@ -66,7 +57,7 @@ def segment_body():
                 @herd(name="copyherd", sizes=[1, 1])
                 def herd_body(tx, ty, sx, sy):
 
-                    for _i in range_(NUM_VECTORS):
+                    for _i in range_(num_subvectors):
                         # We must allocate a buffer of image size for the input/output
                         tensor_in = AllocOp(tensor_type, [], [])
                         tensor_out = AllocOp(tensor_type, [], [])
@@ -90,5 +81,45 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="passthrough_channel.py",
+        description="Builds, runs, and tests the passthrough_channel example",
+    )
+    parser.add_argument(
+        "-s",
+        "--vector_size",
+        type=int,
+        default=4096,
+        help="The size (in bytes) of the data vector to passthrough",
+    )
+    parser.add_argument(
+        "--subvector_size",
+        type=int,
+        default=4,
+        help="The number of sub-vectors to break the vector into",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(args.vector_size, args.subvector_size)
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    output_b = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    for i in range(args.vector_size):
+        input_a[i] = i % 0xFF
+        output_b[i] = i % 0xFF
+
+    runner = XRTRunner(verbose=args.verbose)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/passthrough/passthrough_channel/run.py b/programming_examples/passthrough/passthrough_channel/run.py
deleted file mode 100644
index 82a5b07da..000000000
--- a/programming_examples/passthrough/passthrough_channel/run.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from common import test_main
-from passthrough_channel.passthrough_channel import build_module
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough/passthrough_channel example",
-    )
-    parser.add_argument(
-        "-s",
-        "--vector_size",
-        type=int,
-        default=4096,
-        help="The size (in bytes) of the data vector to passthrough",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(
-        build_module, args.vector_size, experimental_passes=True, verbose=args.verbose
-    )

From 298c0bc90b7e7fccd77f887c993525ee2135b1ec Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 14:57:17 -0600
Subject: [PATCH 04/31] migrate passthrough_kernel to use xrtrunner

---
 .../passthrough/passthrough_kernel/Makefile   |  9 ++-
 .../passthrough_kernel/passthrough_kernel.py  | 71 +++++++++++++------
 .../passthrough/passthrough_kernel/run.py     | 43 -----------
 3 files changed, 57 insertions(+), 66 deletions(-)
 delete mode 100644 programming_examples/passthrough/passthrough_kernel/run.py

diff --git a/programming_examples/passthrough/passthrough_kernel/Makefile b/programming_examples/passthrough/passthrough_kernel/Makefile
index 2e0ca87ff..5408373e7 100644
--- a/programming_examples/passthrough/passthrough_kernel/Makefile
+++ b/programming_examples/passthrough/passthrough_kernel/Makefile
@@ -19,13 +19,16 @@ VPATH := ${MLIR_AIE_DIR}/aie_kernels/generic
 
 all: run
 
+print:
+	${powershell} python3 ${srcdir}/passthrough_kernel.py -p
+
 build/passThrough.cc.o: ${VPATH}/passThrough.cc
 	mkdir -p ${@D}
 	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
 
 run: build/passThrough.cc.o
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/passthrough_kernel.py
 
 clean:
-	rm -rf ${srcdir}/build ${srcdir}/__pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
index 4e2f88457..5cf401b24 100644
--- a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
+++ b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
@@ -1,36 +1,27 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
-from common import *
+INOUT_DATATYPE = np.uint8
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
 
 
 @module_builder
-def build_module(vector_size):
-    assert vector_size % NUM_VECTORS == 0
+def build_module(vector_size, num_subvectors):
+    assert vector_size % num_subvectors == 0
 
     # chop input in 4 sub-tensors
-    lineWidthInBytes = vector_size // NUM_VECTORS
+    lineWidthInBytes = vector_size // num_subvectors
 
     # Type and method of input/output
     memrefTyInOut = T.memref(vector_size, T.ui8())
@@ -70,7 +61,7 @@ def segment_body():
                 @herd(name="copyherd", sizes=[1, 1], link_with="passThrough.cc.o")
                 def herd_body(tx, ty, sx, sy):
 
-                    for i in range_(NUM_VECTORS):
+                    for i in range_(num_subvectors):
                         # We must allocate a buffer of image size for the input/output
                         tensor_in = AllocOp(tensor_type, [], [])
                         tensor_out = AllocOp(tensor_type, [], [])
@@ -92,5 +83,45 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="passthrough_kernel.py",
+        description="Builds, runs, and tests the passthrough_kernel example",
+    )
+    parser.add_argument(
+        "-s",
+        "--vector_size",
+        type=int,
+        default=4096,
+        help="The size (in bytes) of the data vector to passthrough",
+    )
+    parser.add_argument(
+        "--subvector_size",
+        type=int,
+        default=4,
+        help="The number of sub-vectors to break the vector into",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(args.vector_size, args.subvector_size)
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    output_b = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    for i in range(args.vector_size):
+        input_a[i] = i % 0xFF
+        output_b[i] = i % 0xFF
+
+    runner = XRTRunner(verbose=args.verbose)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/passthrough/passthrough_kernel/run.py b/programming_examples/passthrough/passthrough_kernel/run.py
deleted file mode 100644
index 2b6008141..000000000
--- a/programming_examples/passthrough/passthrough_kernel/run.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from common import test_main
-from passthrough_kernel.passthrough_kernel import build_module
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough/passthrough_kernel example",
-    )
-    parser.add_argument(
-        "-s",
-        "--vector_size",
-        type=int,
-        default=4096,
-        help="The size (in bytes) of the data vector to passthrough",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(
-        build_module, args.vector_size, experimental_passes=True, verbose=args.verbose
-    )

From 7d3f247016abfc642fa65092ddc77b8af869edcb Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 16:33:29 -0600
Subject: [PATCH 05/31] Fix up passthrough kernel structure and comments

---
 .../passthrough_kernel/passthrough_kernel.py  | 30 ++++++++-----------
 python/air/backend/xrt_runner.py              | 30 +++++++++++++++++++
 2 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
index 5cf401b24..7d0742b11 100644
--- a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
+++ b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
@@ -8,61 +8,55 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
 INOUT_DATATYPE = np.uint8
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
 
 
 @module_builder
 def build_module(vector_size, num_subvectors):
     assert vector_size % num_subvectors == 0
-
-    # chop input in 4 sub-tensors
-    lineWidthInBytes = vector_size // num_subvectors
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
 
     # Type and method of input/output
-    memrefTyInOut = T.memref(vector_size, T.ui8())
+    memrefTyInOut = T.memref(vector_size, xrt_dtype)
     ChannelOp("ChanIn")
     ChannelOp("ChanOut")
 
-    # We want to store our data in L1 memory
-    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    # The compute core splits input into subvectors for processing
+    lineWidthInBytes = vector_size // num_subvectors
 
-    # This is the type definition of the image
+    # Memref type definition used by the compute core and external function
+    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
     tensor_type = MemRefType.get(
         shape=[lineWidthInBytes],
-        element_type=T.ui8(),
+        element_type=xrt_dtype,
         memory_space=mem_space,
     )
 
+    # Function definition of the external function we will call
     passThroughLine = external_func(
         "passThroughLine", inputs=[tensor_type, tensor_type, T.i32()]
     )
 
-    # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
     def copy(arg0, arg1):
 
-        # The arguments are the input and output
         @launch(operands=[arg0, arg1])
         def launch_body(a, b):
             ChannelPut("ChanIn", a)
             ChannelGet("ChanOut", b)
 
-            # The arguments are still the input and the output
             @segment(name="seg")
             def segment_body():
 
-                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
-                # We just need one compute core, so we ask for a 1x1 herd
                 @herd(name="copyherd", sizes=[1, 1], link_with="passThrough.cc.o")
-                def herd_body(tx, ty, sx, sy):
+                def herd_body(_tx, _ty, _sx, _sy):
 
-                    for i in range_(num_subvectors):
-                        # We must allocate a buffer of image size for the input/output
+                    # Process each subvector individually
+                    for _i in range_(num_subvectors):
                         tensor_in = AllocOp(tensor_type, [], [])
                         tensor_out = AllocOp(tensor_type, [], [])
 
diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index 55ed35a68..45c093f7e 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -5,8 +5,38 @@
 
 import numpy as np
 from .xrt import XRTBackend
+from air.dialects.air import *
 import filelock
 from typing import List
+from collections import defaultdict
+
+TYPE_MAP_DICT = defaultdict(
+    lambda: None,
+    {
+        np.uint8: T.ui8,
+        # TODO: add more mappings here
+    },
+)
+
+
+def type_mapper(np_dtype):
+    """
+    This function is meant to run within a module context (e.g., with a function wrapped with @module_builder)
+    args:
+        np_dtype: the numpy data type to map
+    return:
+        The data type to run on the npu
+    """
+    xrt_dtype = TYPE_MAP_DICT[np_dtype]()
+
+    if xrt_dtype is None:
+        raise AirBackendError(f"numpy data type {np_dtype} has no default mapping")
+    elif xrt_dtype.width / 8 != np.dtype(np_dtype).itemsize:
+        # This is a sanity check on the TYPE_MAP_DICT rather than a check on the user input
+        raise AirBackendError(
+            f"Python data type has width {xrt_dtype.width / 8} but numpy data type has width {np.dtype(np_dtype).itemsize}"
+        )
+    return xrt_dtype
 
 
 class XRTRunner:

From d13e14fa9dd5435ba46f5b45556b3be5890ba5c9 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 16:52:26 -0600
Subject: [PATCH 06/31] clean up other passthrough examples

---
 .../passthrough_channel.py                    | 31 +++++++--------
 .../passthrough_dma/passthrough_dma.py        | 39 +++++++++----------
 2 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
index 93c7265bf..b7f058dce 100644
--- a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
+++ b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
@@ -8,55 +8,54 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
 INOUT_DATATYPE = np.uint8
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
 
 
 @module_builder
 def build_module(vector_size, num_subvectors):
     assert vector_size % num_subvectors == 0
-
-    # chop input in 4 sub-tensors
-    lineWidthInBytes = vector_size // num_subvectors
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
 
     # Type and method of input/output
-    memrefTyInOut = T.memref(vector_size, T.ui8())
+    memrefTyInOut = T.memref(vector_size, xrt_dtype)
     ChannelOp("ChanIn")
     ChannelOp("ChanOut")
 
-    # We want to store our data in L1 memory
-    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    # The compute core splits input into subvectors for processing
+    lineWidthInBytes = vector_size // num_subvectors
 
-    # This is the type definition of the image
+    # Memref type definition used by the compute core and external function
+    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
     tensor_type = MemRefType.get(
         shape=[lineWidthInBytes],
-        element_type=T.ui8(),
+        element_type=xrt_dtype,
         memory_space=mem_space,
     )
 
-    # We will send an image worth of data in and out
+    # Function definition of the external function we will call
+    passThroughLine = external_func(
+        "passThroughLine", inputs=[tensor_type, tensor_type, T.i32()]
+    )
+
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
     def copy(arg0, arg1):
 
-        # The arguments are the input and output
         @launch(operands=[arg0, arg1])
         def launch_body(a, b):
             ChannelPut("ChanIn", a)
             ChannelGet("ChanOut", b)
 
-            # The arguments are still the input and the output
             @segment(name="seg")
             def segment_body():
 
-                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
-                # We just need one compute core, so we ask for a 1x1 herd
                 @herd(name="copyherd", sizes=[1, 1])
-                def herd_body(tx, ty, sx, sy):
+                def herd_body(_tx, _ty, _sx, _sy):
 
+                    # Process each subvector individually
                     for _i in range_(num_subvectors):
                         # We must allocate a buffer of image size for the input/output
                         tensor_in = AllocOp(tensor_type, [], [])
diff --git a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
index e14cb97dc..567cccb63 100644
--- a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
+++ b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
@@ -8,58 +8,57 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
 INOUT_DATATYPE = np.uint8
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
 
 
 @module_builder
 def build_module(vector_size, num_subvectors):
     assert vector_size % num_subvectors == 0
-
-    # chop input in 4 sub-tensors
-    lineWidthInBytes = vector_size // num_subvectors
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
 
     # Type and method of input/output
-    memrefTyInOut = T.memref(vector_size, T.ui8())
+    memrefTyInOut = T.memref(vector_size, xrt_dtype)
 
-    # We want to store our data in L1 memory
-    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    # The compute core splits input into subvectors for processing
+    lineWidthInBytes = vector_size // num_subvectors
 
-    # This is the type definition of the image
+    # Memref type definition used by the compute core and external function
+    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
     tensor_type = MemRefType.get(
         shape=[lineWidthInBytes],
-        element_type=T.ui8(),
+        element_type=xrt_dtype,
         memory_space=mem_space,
     )
 
-    # We will send an image worth of data in and out
+    # Function definition of the external function we will call
+    passThroughLine = external_func(
+        "passThroughLine", inputs=[tensor_type, tensor_type, T.i32()]
+    )
+
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
     def copy(arg0, arg1):
 
-        # The arguments are the input and output
         @launch(operands=[arg0, arg1])
         def launch_body(a, b):
 
-            # The arguments are still the input and the output
             @segment(name="seg", operands=[a, b])
-            def segment_body(arg0, arg1):
+            def segment_body(arg2, arg3):
 
-                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
-                # We just need one compute core, so we ask for a 1x1 herd
-                @herd(name="copyherd", sizes=[1, 1], operands=[arg0, arg1])
-                def herd_body(tx, ty, sx, sy, a, b):
+                @herd(name="copyherd", sizes=[1, 1], operands=[arg2, arg3])
+                def herd_body(_tx, _ty, _sx, _sy, c, d):
 
+                    # Process each subvector individually
                     for _i in range_(num_subvectors):
                         # We must allocate a buffer of image size for the input/output
                         tensor_in = AllocOp(tensor_type, [], [])
                         tensor_out = AllocOp(tensor_type, [], [])
 
                         # Place the input image (a) into the L1 memory region
-                        dma_memcpy_nd(tensor_in, a)
+                        dma_memcpy_nd(tensor_in, c)
 
                         for j in range_(lineWidthInBytes):
                             # Load the input value
@@ -69,7 +68,7 @@ def herd_body(tx, ty, sx, sy, a, b):
                             store(val, tensor_out, [j])
                             yield_([])
 
-                        dma_memcpy_nd(b, tensor_out)
+                        dma_memcpy_nd(d, tensor_out)
 
                         # Deallocate our L1 buffers
                         DeallocOp(tensor_in)

From 273ffa9237a57ae43669afb75233dda53062012c Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 17:33:55 -0600
Subject: [PATCH 07/31] Continue cleaning up passthrough example

---
 .../passthrough_channel/passthrough_channel.py         |  4 ++--
 .../passthrough/passthrough_dma/passthrough_dma.py     |  4 ++--
 .../passthrough/passthrough_kernel/Makefile            | 10 +++++-----
 .../passthrough_kernel/passthrough_kernel.py           |  4 ++--
 python/air/backend/xrt_runner.py                       |  7 ++-----
 5 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
index b7f058dce..776576fa1 100644
--- a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
+++ b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
@@ -114,8 +114,8 @@ def herd_body(_tx, _ty, _sx, _sy):
         print(mlir_module)
         exit(0)
 
-    input_a = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
-    output_b = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    input_a = np.zeros(shape=(args.vector_size), dtype=INOUT_DATATYPE)
+    output_b = np.zeros(shape=(args.vector_size), dtype=INOUT_DATATYPE)
     for i in range(args.vector_size):
         input_a[i] = i % 0xFF
         output_b[i] = i % 0xFF
diff --git a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
index 567cccb63..0ad75c6e3 100644
--- a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
+++ b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
@@ -111,8 +111,8 @@ def herd_body(_tx, _ty, _sx, _sy, c, d):
         print(mlir_module)
         exit(0)
 
-    input_a = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
-    output_b = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    input_a = np.zeros(shape=(args.vector_size), dtype=INOUT_DATATYPE)
+    output_b = np.zeros(shape=(args.vector_size), dtype=INOUT_DATATYPE)
     for i in range(args.vector_size):
         input_a[i] = i % 0xFF
         output_b[i] = i % 0xFF
diff --git a/programming_examples/passthrough/passthrough_kernel/Makefile b/programming_examples/passthrough/passthrough_kernel/Makefile
index 5408373e7..bd5df8d8e 100644
--- a/programming_examples/passthrough/passthrough_kernel/Makefile
+++ b/programming_examples/passthrough/passthrough_kernel/Makefile
@@ -20,13 +20,13 @@ VPATH := ${MLIR_AIE_DIR}/aie_kernels/generic
 all: run
 
 print:
-	${powershell} python3 ${srcdir}/passthrough_kernel.py -p
+	${powershell} python3 ${srcdir}/passthrough_kernel.py -p
 
-build/passThrough.cc.o: ${VPATH}/passThrough.cc
-	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+${srcdir}/build/passThrough.cc.o: ${VPATH}/passThrough.cc
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
 
-run: build/passThrough.cc.o
+run: ${srcdir}/build/passThrough.cc.o
 	mkdir -p ${srcdir}/build
 	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/passthrough_kernel.py
 
diff --git a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
index 7d0742b11..7268cfadb 100644
--- a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
+++ b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
@@ -111,8 +111,8 @@ def herd_body(_tx, _ty, _sx, _sy):
         print(mlir_module)
         exit(0)
 
-    input_a = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
-    output_b = np.arange(1, args.vector_size + 1, dtype=INOUT_DATATYPE)
+    input_a = np.zeros(shape=(args.vector_size), dtype=INOUT_DATATYPE)
+    output_b = np.zeros(shape=(args.vector_size), dtype=INOUT_DATATYPE)
     for i in range(args.vector_size):
         input_a[i] = i % 0xFF
         output_b[i] = i % 0xFF
diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index 45c093f7e..995a46a33 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -91,13 +91,10 @@ def _check_outputs(
         assert len(actual_outputs) == len(
             expected_outputs
         ), f"Number of actual outputs ({len(actual_outputs)}) does not equal number of expected outputs ({len(expected_outputs)})"
+        np.set_printoptions(formatter={"int": hex})
 
         for i, (actual, expected) in enumerate(zip(actual_outputs, expected_outputs)):
-
-            # TODO: may need to reshape for this to be true?
-            assert (
-                actual.shape == expected.shape
-            ), f"Actual output shape {actual.shape} does not meet expected output shape {expected.shape}"
+            actual = np.reshape(actual, expected.shape)
 
             if self.verbose:
                 print("Expected: ")

From 5784a40229b6ac90b63fb9b80708334b40fe9623 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 18:20:38 -0600
Subject: [PATCH 08/31] Start to clean up matrix scalar add

---
 .../single_core_dma/Makefile                  |  11 +-
 .../matrix_scalar_add/single_core_dma/run.py  |  35 -----
 .../single_core_dma/single_core_dma.py        | 126 +++++++++++++-----
 python/air/backend/xrt_runner.py              |   5 +-
 4 files changed, 103 insertions(+), 74 deletions(-)
 delete mode 100644 programming_examples/matrix_scalar_add/single_core_dma/run.py

diff --git a/programming_examples/matrix_scalar_add/single_core_dma/Makefile b/programming_examples/matrix_scalar_add/single_core_dma/Makefile
index 77dc865ad..53add91cd 100644
--- a/programming_examples/matrix_scalar_add/single_core_dma/Makefile
+++ b/programming_examples/matrix_scalar_add/single_core_dma/Makefile
@@ -4,9 +4,14 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/single_core_dma.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/single_core_dma.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/matrix_scalar_add/single_core_dma/run.py b/programming_examples/matrix_scalar_add/single_core_dma/run.py
deleted file mode 100644
index 779893e20..000000000
--- a/programming_examples/matrix_scalar_add/single_core_dma/run.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from single_core_dma.single_core_dma import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, experimental_passes=True, verbose=args.verbose)
diff --git a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
index 2b1c03575..f5e9a308e 100644
--- a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
+++ b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
@@ -1,33 +1,26 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
-from common import *
-
 
 @module_builder
-def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
+    assert image_height % tile_height == 0
+    assert image_width % tile_width == 0
+    image_size = [image_height, image_width]
+    tile_size = [tile_height, tile_width]
+    xrt_dtype = type_mapper(np_dtype)
+
+    memrefTyInOut = MemRefType.get(image_size, xrt_dtype)
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
@@ -45,28 +38,27 @@ def segment_body(arg2, arg3):
                 # We just need one compute core, so we ask for a 1x1 herd
                 @herd(name="xaddherd", sizes=[1, 1], operands=[arg2, arg3])
                 def herd_body(_tx, _ty, _sx, _sy, a, b):
-
                     # We want to store our data in L1 memory
                     mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
 
                     # This is the type definition of the tile
                     tile_type = MemRefType.get(
-                        shape=TILE_SIZE,
+                        shape=tile_size,
                         element_type=T.i32(),
                         memory_space=mem_space,
                     )
 
                     # Loop over columns and rows of tiles
-                    for tile_index0 in range_(IMAGE_HEIGHT // TILE_HEIGHT):
-                        for tile_index1 in range_(IMAGE_WIDTH // TILE_WIDTH):
+                    for tile_index0 in range_(image_height // tile_height):
+                        for tile_index1 in range_(image_width // tile_width):
 
                             # We must allocate a buffer of tile size for the input/output
                             tile_in = AllocOp(tile_type, [], [])
                             tile_out = AllocOp(tile_type, [], [])
 
                             # Convert the type of the tile size variable to the Index type
-                            tile_size0 = arith.ConstantOp.create_index(TILE_HEIGHT)
-                            tile_size1 = arith.ConstantOp.create_index(TILE_WIDTH)
+                            tile_size0 = arith.ConstantOp.create_index(tile_height)
+                            tile_size1 = arith.ConstantOp.create_index(tile_width)
 
                             # Calculate the offset into the channel data, which is based on our loop vars
                             offset0 = arith.MulIOp(tile_size0, tile_index0)
@@ -74,7 +66,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
                             tile_num = arith.MulIOp(
                                 tile_index0,
                                 arith.ConstantOp.create_index(
-                                    IMAGE_WIDTH // TILE_WIDTH
+                                    image_width // tile_width
                                 ),
                             )
                             tile_num = arith.AddIOp(tile_num, tile_index1)
@@ -84,23 +76,23 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
                                 tile_in,
                                 a,
                                 src_offsets=[offset0, offset1],
-                                src_sizes=[TILE_HEIGHT, TILE_WIDTH],
-                                src_strides=[IMAGE_WIDTH, 1],
+                                src_sizes=[tile_height, tile_width],
+                                src_strides=[image_width, 1],
                             )
 
                             # Access every value in the tile
-                            for j in range_(TILE_HEIGHT):
-                                for i in range_(TILE_WIDTH):
+                            for i in range_(tile_height):
+                                for j in range_(tile_width):
                                     # Load the input value from tile_in
-                                    val_in = load(tile_in, [j, i])
+                                    val_in = load(tile_in, [i, j])
 
                                     # Compute the output value
                                     val_out = arith.addi(
-                                        val_in, arith.index_cast(T.i32(), tile_num)
+                                        val_in, arith.index_cast(xrt_dtype, tile_num)
                                     )
 
                                     # Store the output value in tile_out
-                                    store(val_out, tile_out, [j, i])
+                                    store(val_out, tile_out, [i, j])
                                     yield_([])
                                 yield_([])
 
@@ -109,8 +101,8 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
                                 b,
                                 tile_out,
                                 dst_offsets=[offset0, offset1],
-                                dst_sizes=[TILE_HEIGHT, TILE_WIDTH],
-                                dst_strides=[IMAGE_WIDTH, 1],
+                                dst_sizes=[tile_height, tile_width],
+                                dst_strides=[image_width, 1],
                             )
 
                             # Deallocate our L1 buffers
@@ -122,5 +114,69 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    # Default values.
+    IMAGE_WIDTH = 16
+    IMAGE_HEIGHT = 32
+    TILE_WIDTH = 8
+    TILE_HEIGHT = 16
+    INOUT_DATATYPE = np.uint32
+
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the matrix_scalar_add/single_core_dma example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--image-height",
+        type=int,
+        default=IMAGE_HEIGHT,
+        help="Height of the image data",
+    )
+    parser.add_argument(
+        "--image-width", type=int, default=IMAGE_WIDTH, help="Width of the image data"
+    )
+    parser.add_argument(
+        "--tile-height", type=int, default=TILE_HEIGHT, help="Height of the tile data"
+    )
+    parser.add_argument(
+        "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(
+        args.image_height,
+        args.image_width,
+        args.tile_height,
+        args.tile_width,
+        INOUT_DATATYPE,
+    )
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    output_b = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    for i in range(args.image_height):
+        for j in range(args.image_width):
+            input_a[i, j] = i * args.image_height + j
+            tile_num = (
+                i // args.tile_height * (args.image_width // args.tile_width)
+                + j // args.tile_width
+            )
+            output_b[i, j] = input_a[i, j] + tile_num
+
+    runner = XRTRunner(verbose=args.verbose)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index 995a46a33..e4c88004a 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -14,6 +14,7 @@
     lambda: None,
     {
         np.uint8: T.ui8,
+        np.uint32: T.i32,
         # TODO: add more mappings here
     },
 )
@@ -67,7 +68,9 @@ def run_test(
         )
 
         # run the module - slots are input/output for now, assume non-overlapping inputs/outputs
-        expanded_inputs = inputs + expected_outputs
+        expanded_inputs = inputs + [
+            np.zeros(o.shape, o.dtype) for o in expected_outputs
+        ]
         with filelock.FileLock("/tmp/npu.lock"):
             module_function = backend.compile_and_load(mlir_module)
             actual_outputs = module_function(*expanded_inputs)

From c1d8893658d7aa4d78c89fa4817d7d8e7da3d37a Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 18:27:18 -0600
Subject: [PATCH 09/31] Continue fixing up matrix scalar add example

---
 .../matrix_scalar_add/common.py               |  89 -------------
 .../single_core_channel/Makefile              |  11 +-
 .../single_core_channel/run.py                |  35 -----
 .../single_core_channel.py                    | 123 +++++++++++++-----
 4 files changed, 98 insertions(+), 160 deletions(-)
 delete mode 100644 programming_examples/matrix_scalar_add/common.py
 delete mode 100644 programming_examples/matrix_scalar_add/single_core_channel/run.py

diff --git a/programming_examples/matrix_scalar_add/common.py b/programming_examples/matrix_scalar_add/common.py
deleted file mode 100644
index 1c5cfbf59..000000000
--- a/programming_examples/matrix_scalar_add/common.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: MIT
-
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-IMAGE_WIDTH = 16
-IMAGE_HEIGHT = 32
-IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
-
-TILE_WIDTH = 8
-TILE_HEIGHT = 16
-TILE_SIZE = [TILE_HEIGHT, TILE_WIDTH]
-
-assert IMAGE_WIDTH % TILE_WIDTH == 0
-assert IMAGE_HEIGHT % TILE_HEIGHT == 0
-
-
-INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def print_matrix(matrix_array):
-    for i in range(IMAGE_HEIGHT):
-        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
-        for val in row:
-            val = val & 0xFFFF
-            print(f"{val:04x}", end=" ")
-        print("")
-
-
-def test_main(build_module, experimental_passes, verbose=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = i + 0x1000
-        input_b[i] = 0x00DEFACED
-
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose,
-        omit_while_true_loop=True,
-        experimental_passes=experimental_passes,
-    )
-
-    if verbose:
-        print_matrix(input_b)
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        addone = backend.compile_and_load(mlir_module)
-        (_, output_b) = addone(input_a, input_b)
-
-    backend.unload()
-
-    if verbose:
-        print_matrix(output_b)
-
-    # check output, should have all values incremented
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-
-        row = i // IMAGE_WIDTH
-        col = i % IMAGE_WIDTH
-        tile_num = (row // TILE_HEIGHT) * (IMAGE_WIDTH // TILE_WIDTH) + (
-            col // TILE_WIDTH
-        )
-
-        # value should have been updated
-        expected_value = 0x1000 + i + tile_num
-        if not (rb == expected_value):
-            """
-            print(
-                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
-            )
-            """
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
diff --git a/programming_examples/matrix_scalar_add/single_core_channel/Makefile b/programming_examples/matrix_scalar_add/single_core_channel/Makefile
index 77dc865ad..06824cd1f 100644
--- a/programming_examples/matrix_scalar_add/single_core_channel/Makefile
+++ b/programming_examples/matrix_scalar_add/single_core_channel/Makefile
@@ -4,9 +4,14 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/single_core_channel.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/single_core_channel.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/matrix_scalar_add/single_core_channel/run.py b/programming_examples/matrix_scalar_add/single_core_channel/run.py
deleted file mode 100644
index 3925585e5..000000000
--- a/programming_examples/matrix_scalar_add/single_core_channel/run.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from single_core_channel.single_core_channel import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, experimental_passes=True, verbose=args.verbose)
diff --git a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
index d2e3ff8be..29e8cccf6 100644
--- a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
+++ b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
@@ -1,33 +1,26 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
-from common import *
-
 
 @module_builder
-def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
+    assert image_height % tile_height == 0
+    assert image_width % tile_width == 0
+    image_size = [image_height, image_width]
+    tile_size = [tile_height, tile_width]
+    xrt_dtype = type_mapper(np_dtype)
+
+    memrefTyInOut = MemRefType.get(image_size, xrt_dtype)
 
     # Create two channels which will send/receive the
     # input/output data respectively
@@ -43,11 +36,11 @@ def copy(arg0, arg1):
         def launch_body(a, b):
 
             # Transform data into contiguous tiles
-            for tile_index0 in range_(IMAGE_HEIGHT // TILE_HEIGHT):
-                for tile_index1 in range_(IMAGE_WIDTH // TILE_WIDTH):
+            for tile_index0 in range_(image_height // tile_height):
+                for tile_index1 in range_(image_width // tile_width):
                     # Convert the type of the tile size variable to the Index type
-                    tile_size0 = arith.ConstantOp.create_index(TILE_HEIGHT)
-                    tile_size1 = arith.ConstantOp.create_index(TILE_WIDTH)
+                    tile_size0 = arith.ConstantOp.create_index(tile_height)
+                    tile_size1 = arith.ConstantOp.create_index(tile_width)
 
                     # Calculate the offset into the channel data, which is based on which tile index
                     # we are at using tile_index0 and tile_index1 (our loop vars).
@@ -61,8 +54,8 @@ def launch_body(a, b):
                         "ChanIn",
                         a,
                         offsets=[offset0, offset1],
-                        sizes=[TILE_HEIGHT, TILE_WIDTH],
-                        strides=[IMAGE_WIDTH, 1],
+                        sizes=[tile_height, tile_width],
+                        strides=[image_width, 1],
                     )
 
                     # Write data back out to the channel tile by tile
@@ -70,8 +63,8 @@ def launch_body(a, b):
                         "ChanOut",
                         b,
                         offsets=[offset0, offset1],
-                        sizes=[TILE_HEIGHT, TILE_WIDTH],
-                        strides=[IMAGE_WIDTH, 1],
+                        sizes=[tile_height, tile_width],
+                        strides=[image_width, 1],
                     )
                     yield_([])
                 yield_([])
@@ -90,14 +83,14 @@ def herd_body(_tx, _ty, _sx, _sy):
 
                     # This is the type definition of the tile
                     tile_type = MemRefType.get(
-                        shape=TILE_SIZE,
-                        element_type=T.i32(),
+                        shape=tile_size,
+                        element_type=xrt_dtype,
                         memory_space=mem_space,
                     )
 
                     # Loop over columns and rows of tiles
                     for tile_num in range_(
-                        (IMAGE_WIDTH // TILE_WIDTH) * (IMAGE_HEIGHT // TILE_HEIGHT)
+                        (image_width // tile_width) * (image_height // tile_height)
                     ):
 
                         # We must allocate a buffer of tile size for the input/output
@@ -108,14 +101,14 @@ def herd_body(_tx, _ty, _sx, _sy):
                         ChannelGet("ChanIn", tile_in)
 
                         # Access every value in the tile
-                        for j in range_(TILE_HEIGHT):
-                            for i in range_(TILE_WIDTH):
+                        for j in range_(tile_height):
+                            for i in range_(tile_width):
                                 # Load the input value from tile_in
                                 val_in = load(tile_in, [j, i])
 
                                 # Compute the output value
                                 val_out = arith.addi(
-                                    val_in, arith.index_cast(T.i32(), tile_num)
+                                    val_in, arith.index_cast(xrt_dtype, tile_num)
                                 )
 
                                 # Store the output value in tile_out
@@ -138,5 +131,69 @@ def herd_body(_tx, _ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    # Default values.
+    IMAGE_WIDTH = 16
+    IMAGE_HEIGHT = 32
+    TILE_WIDTH = 8
+    TILE_HEIGHT = 16
+    INOUT_DATATYPE = np.uint32
+
+    parser = argparse.ArgumentParser(
+        prog="single_core_channel.py",
+        description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--image-height",
+        type=int,
+        default=IMAGE_HEIGHT,
+        help="Height of the image data",
+    )
+    parser.add_argument(
+        "--image-width", type=int, default=IMAGE_WIDTH, help="Width of the image data"
+    )
+    parser.add_argument(
+        "--tile-height", type=int, default=TILE_HEIGHT, help="Height of the tile data"
+    )
+    parser.add_argument(
+        "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(
+        args.image_height,
+        args.image_width,
+        args.tile_height,
+        args.tile_width,
+        INOUT_DATATYPE,
+    )
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    output_b = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    for i in range(args.image_height):
+        for j in range(args.image_width):
+            input_a[i, j] = i * args.image_width + j  # unique row-major index
+            tile_num = (
+                i // args.tile_height * (args.image_width // args.tile_width)
+                + j // args.tile_width
+            )
+            output_b[i, j] = input_a[i, j] + tile_num
+
+    runner = XRTRunner(verbose=args.verbose)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))

From 41cdb311cbd87fe513fd111caa394af306f2b0e8 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 18:36:20 -0600
Subject: [PATCH 10/31] Get multi-launch example ported to new format

---
 .../multi_launch_channel/Makefile             |  11 +-
 .../multi_launch_channel.py                   | 127 +++++++++++++-----
 .../single_core_channel.py                    |  12 +-
 3 files changed, 102 insertions(+), 48 deletions(-)

diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/Makefile b/programming_examples/matrix_scalar_add/multi_launch_channel/Makefile
index 77dc865ad..54b469e65 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/Makefile
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/Makefile
@@ -4,9 +4,14 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/multi_launch_channel.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/multi_launch_channel.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
index 471ad532a..f8dd2895e 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
@@ -1,18 +1,6 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
 
 from air.ir import *
 from air.dialects.air import *
@@ -20,15 +8,20 @@
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects.affine import apply as affine_apply
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
-from common import *
-
 
 @module_builder
-def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
+    assert image_height % tile_height == 0
+    assert image_width % tile_width == 0
+    image_size = [image_height, image_width]
+    tile_size = [tile_height, tile_width]
+    xrt_dtype = type_mapper(np_dtype)
+
+    memrefTyInOut = MemRefType.get(image_size, xrt_dtype)
 
     # Create two channels which will send/receive the
     # input/output data respectively
@@ -41,7 +34,7 @@ def copy(arg0, arg1):
 
         # The arguments are the input and output
         @launch(
-            sizes=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH],
+            sizes=[image_height // tile_height, image_width // tile_width],
             operands=[arg0, arg1],
         )
         def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
@@ -51,7 +44,7 @@ def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
                 [
                     AffineExpr.get_mul(
                         AffineSymbolExpr.get(0),
-                        AffineConstantExpr.get(TILE_HEIGHT),
+                        AffineConstantExpr.get(tile_height),
                     )
                 ],
             )
@@ -61,7 +54,7 @@ def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
                 [
                     AffineExpr.get_mul(
                         AffineSymbolExpr.get(0),
-                        AffineConstantExpr.get(TILE_WIDTH),
+                        AffineConstantExpr.get(tile_width),
                     )
                 ],
             )
@@ -73,8 +66,8 @@ def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
                 "ChanIn",
                 a,
                 offsets=[offset0, offset1],
-                sizes=[TILE_HEIGHT, TILE_WIDTH],
-                strides=[IMAGE_WIDTH, 1],
+                sizes=[tile_height, tile_width],
+                strides=[image_width, 1],
             )
 
             # Write data back out to the channel tile by tile
@@ -82,8 +75,8 @@ def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
                 "ChanOut",
                 b,
                 offsets=[offset0, offset1],
-                sizes=[TILE_HEIGHT, TILE_WIDTH],
-                strides=[IMAGE_WIDTH, 1],
+                sizes=[tile_height, tile_width],
+                strides=[image_width, 1],
             )
 
             # The arguments are still the input and the output
@@ -97,7 +90,7 @@ def herd_body(tx, ty, sx, sy, a, b):
 
                     # Loop over columns and rows of tiles
                     for tile_num in range_(
-                        (IMAGE_WIDTH // TILE_WIDTH) * (IMAGE_HEIGHT // TILE_HEIGHT)
+                        (image_width // tile_width) * (image_height // tile_height)
                     ):
 
                         # We want to store our data in L1 memory
@@ -105,8 +98,8 @@ def herd_body(tx, ty, sx, sy, a, b):
 
                         # This is the type definition of the tile
                         tile_type = MemRefType.get(
-                            shape=TILE_SIZE,
-                            element_type=T.i32(),
+                            shape=tile_size,
+                            element_type=xrt_dtype,
                             memory_space=mem_space,
                         )
 
@@ -118,22 +111,18 @@ def herd_body(tx, ty, sx, sy, a, b):
                         ChannelGet("ChanIn", tile_in)
 
                         # Access every value in the tile
-                        for j in range_(TILE_HEIGHT):
-                            for i in range_(TILE_WIDTH):
+                        for j in range_(tile_height):
+                            for i in range_(tile_width):
                                 # Load the input value from tile_in
                                 val_in = load(tile_in, [j, i])
 
                                 # Compute the output value TODO(hunhoffe): this is not correct, not sure how to percolate launch info here
                                 val_out = arith.addi(
-                                    val_in, arith.index_cast(T.i32(), tile_num)
+                                    val_in, arith.index_cast(xrt_dtype, tile_num)
                                 )
 
                                 # Store the output value in tile_out
-                                store(
-                                    val_out,
-                                    tile_out,
-                                    [j, i],
-                                )
+                                store(val_out, tile_out, [j, i])
                                 yield_([])
                             yield_([])
 
@@ -148,5 +137,69 @@ def herd_body(tx, ty, sx, sy, a, b):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    # Default values.
+    IMAGE_WIDTH = 16
+    IMAGE_HEIGHT = 32
+    TILE_WIDTH = 8
+    TILE_HEIGHT = 16
+    INOUT_DATATYPE = np.uint32
+
+    parser = argparse.ArgumentParser(
+        prog="multi_launch_channel.py",
+        description="Builds, runs, and tests the matrix_scalar_add/multi_launch_channel example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--image-height",
+        type=int,
+        default=IMAGE_HEIGHT,
+        help="Height of the image data",
+    )
+    parser.add_argument(
+        "--image-width", type=int, default=IMAGE_WIDTH, help="Width of the image data"
+    )
+    parser.add_argument(
+        "--tile-height", type=int, default=TILE_HEIGHT, help="Height of the tile data"
+    )
+    parser.add_argument(
+        "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(
+        args.image_height,
+        args.image_width,
+        args.tile_height,
+        args.tile_width,
+        INOUT_DATATYPE,
+    )
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    output_b = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    for i in range(args.image_height):
+        for j in range(args.image_width):
+            input_a[i, j] = i * args.image_width + j  # unique row-major index
+            tile_num = (
+                i // args.tile_height * (args.image_width // args.tile_width)
+                + j // args.tile_width
+            )
+            output_b[i, j] = input_a[i, j] + tile_num
+
+    runner = XRTRunner(verbose=args.verbose)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
index 29e8cccf6..b33815baa 100644
--- a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
+++ b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
@@ -101,10 +101,10 @@ def herd_body(_tx, _ty, _sx, _sy):
                         ChannelGet("ChanIn", tile_in)
 
                         # Access every value in the tile
-                        for j in range_(tile_height):
-                            for i in range_(tile_width):
+                        for i in range_(tile_height):
+                            for j in range_(tile_width):
                                 # Load the input value from tile_in
-                                val_in = load(tile_in, [j, i])
+                                val_in = load(tile_in, [i, j])
 
                                 # Compute the output value
                                 val_out = arith.addi(
@@ -112,11 +112,7 @@ def herd_body(_tx, _ty, _sx, _sy):
                                 )
 
                                 # Store the output value in tile_out
-                                store(
-                                    val_out,
-                                    tile_out,
-                                    [j, i],
-                                )
+                                store(val_out, tile_out, [i, j])
                                 yield_([])
                             yield_([])
 

From 9c88fd3845cb9bba703572eb98228923c1cf15b4 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 18:43:44 -0600
Subject: [PATCH 11/31] Clean up multi-core-dma; use tile_size where
 appropriate

---
 .../matrix_scalar_add/multi_core_dma/Makefile |  11 +-
 .../multi_core_dma/multi_core_dma.py          | 125 +++++++++++++-----
 .../matrix_scalar_add/multi_core_dma/run.py   |  35 -----
 .../multi_launch_channel.py                   |   4 +-
 .../single_core_channel.py                    |   4 +-
 .../single_core_dma/single_core_dma.py        |   4 +-
 6 files changed, 105 insertions(+), 78 deletions(-)
 delete mode 100644 programming_examples/matrix_scalar_add/multi_core_dma/run.py

diff --git a/programming_examples/matrix_scalar_add/multi_core_dma/Makefile b/programming_examples/matrix_scalar_add/multi_core_dma/Makefile
index 77dc865ad..46f6859f6 100644
--- a/programming_examples/matrix_scalar_add/multi_core_dma/Makefile
+++ b/programming_examples/matrix_scalar_add/multi_core_dma/Makefile
@@ -4,9 +4,14 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/multi_core_dma.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/multi_core_dma.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
index 4905d3eee..9cfb01a55 100644
--- a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
+++ b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
@@ -1,18 +1,6 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
 
 from air.ir import *
 from air.dialects.air import *
@@ -20,15 +8,20 @@
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects.affine import apply as affine_apply
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
-from common import *
-
 
 @module_builder
-def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
+    assert image_height % tile_height == 0
+    assert image_width % tile_width == 0
+    image_size = [image_height, image_width]
+    tile_size = [tile_height, tile_width]
+    xrt_dtype = type_mapper(np_dtype)
+
+    memrefTyInOut = MemRefType.get(image_size, xrt_dtype)
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
@@ -46,17 +39,17 @@ def segment_body(arg2, arg3):
                 # We are hoping to map each tile to a different compute core.
                 @herd(
                     name="xaddherd",
-                    sizes=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH],
+                    sizes=[image_height // tile_height, image_width // tile_width],
                     operands=[arg2, arg3],
                 )
-                def herd_body(tx, ty, sx, sy, a, b):
+                def herd_body(tx, ty, _sx, _sy, a, b):
                     scaled_index_map_height = AffineMap.get(
                         0,
                         1,
                         [
                             AffineExpr.get_mul(
                                 AffineSymbolExpr.get(0),
-                                AffineConstantExpr.get(TILE_HEIGHT),
+                                AffineConstantExpr.get(tile_height),
                             )
                         ],
                     )
@@ -66,7 +59,7 @@ def herd_body(tx, ty, sx, sy, a, b):
                         [
                             AffineExpr.get_mul(
                                 AffineSymbolExpr.get(0),
-                                AffineConstantExpr.get(TILE_WIDTH),
+                                AffineConstantExpr.get(tile_width),
                             )
                         ],
                     )
@@ -76,7 +69,7 @@ def herd_body(tx, ty, sx, sy, a, b):
                         [
                             AffineExpr.get_mul(
                                 AffineSymbolExpr.get(0),
-                                AffineConstantExpr.get(IMAGE_WIDTH // TILE_WIDTH),
+                                AffineConstantExpr.get(image_width // tile_width),
                             )
                         ],
                     )
@@ -102,7 +95,7 @@ def herd_body(tx, ty, sx, sy, a, b):
 
                     # This is the type definition of the tile
                     tile_type = MemRefType.get(
-                        shape=TILE_SIZE,
-                        element_type=T.i32(),
+                        shape=tile_size,
+                        element_type=xrt_dtype,
                         memory_space=mem_space,
                     )
@@ -116,23 +109,23 @@ def herd_body(tx, ty, sx, sy, a, b):
                         tile_in,
                         a,
                         src_offsets=[offset0, offset1],
-                        src_sizes=[TILE_HEIGHT, TILE_WIDTH],
-                        src_strides=[IMAGE_WIDTH, 1],
+                        src_sizes=tile_size,
+                        src_strides=[image_width, 1],
                     )
 
                     # Access every value in the tile
-                    for j in range_(TILE_HEIGHT):
-                        for i in range_(TILE_WIDTH):
+                    for i in range_(tile_height):
+                        for j in range_(tile_width):
                             # Load the input value from tile_in
-                            val_in = load(tile_in, [j, i])
+                            val_in = load(tile_in, [i, j])
 
                             # Compute the output value
                             val_out = arith.addi(
-                                val_in, arith.index_cast(T.i32(), compute_tile_id)
+                                val_in, arith.index_cast(xrt_dtype, compute_tile_id)
                             )
 
                             # Store the output value in tile_out
-                            store(val_out, tile_out, [j, i])
+                            store(val_out, tile_out, [i, j])
                             yield_([])
                         yield_([])
 
@@ -141,8 +134,8 @@ def herd_body(tx, ty, sx, sy, a, b):
                         b,
                         tile_out,
                         dst_offsets=[offset0, offset1],
-                        dst_sizes=[TILE_HEIGHT, TILE_WIDTH],
-                        dst_strides=[IMAGE_WIDTH, 1],
+                        dst_sizes=tile_size,
+                        dst_strides=[image_width, 1],
                     )
 
                     # Deallocate our L1 buffers
@@ -151,5 +144,69 @@ def herd_body(tx, ty, sx, sy, a, b):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    # Default values.
+    IMAGE_WIDTH = 16
+    IMAGE_HEIGHT = 32
+    TILE_WIDTH = 8
+    TILE_HEIGHT = 16
+    INOUT_DATATYPE = np.uint32
+
+    parser = argparse.ArgumentParser(
+        prog="multi_core_dma.py",
+        description="Builds, runs, and tests the matrix_scalar_add/multi_core_dma example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--image-height",
+        type=int,
+        default=IMAGE_HEIGHT,
+        help="Height of the image data",
+    )
+    parser.add_argument(
+        "--image-width", type=int, default=IMAGE_WIDTH, help="Width of the image data"
+    )
+    parser.add_argument(
+        "--tile-height", type=int, default=TILE_HEIGHT, help="Height of the tile data"
+    )
+    parser.add_argument(
+        "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(
+        args.image_height,
+        args.image_width,
+        args.tile_height,
+        args.tile_width,
+        INOUT_DATATYPE,
+    )
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    output_b = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    for i in range(args.image_height):
+        for j in range(args.image_width):
+            input_a[i, j] = i * args.image_width + j  # unique row-major index
+            tile_num = (
+                i // args.tile_height * (args.image_width // args.tile_width)
+                + j // args.tile_width
+            )
+            output_b[i, j] = input_a[i, j] + tile_num
+
+    runner = XRTRunner(verbose=args.verbose)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/matrix_scalar_add/multi_core_dma/run.py b/programming_examples/matrix_scalar_add/multi_core_dma/run.py
deleted file mode 100644
index 6c15708ba..000000000
--- a/programming_examples/matrix_scalar_add/multi_core_dma/run.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from multi_core_dma.multi_core_dma import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/multi_core_dma example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, experimental_passes=True, verbose=args.verbose)
diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
index f8dd2895e..a2c56502a 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
@@ -66,7 +66,7 @@ def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
                 "ChanIn",
                 a,
                 offsets=[offset0, offset1],
-                sizes=[tile_height, tile_width],
+                sizes=tile_size,
                 strides=[image_width, 1],
             )
 
@@ -75,7 +75,7 @@ def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
                 "ChanOut",
                 b,
                 offsets=[offset0, offset1],
-                sizes=[tile_height, tile_width],
+                sizes=tile_size,
                 strides=[image_width, 1],
             )
 
diff --git a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
index b33815baa..291a824d4 100644
--- a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
+++ b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
@@ -54,7 +54,7 @@ def launch_body(a, b):
                         "ChanIn",
                         a,
                         offsets=[offset0, offset1],
-                        sizes=[tile_height, tile_width],
+                        sizes=tile_size,
                         strides=[image_width, 1],
                     )
 
@@ -63,7 +63,7 @@ def launch_body(a, b):
                         "ChanOut",
                         b,
                         offsets=[offset0, offset1],
-                        sizes=[tile_height, tile_width],
+                        sizes=tile_size,
                         strides=[image_width, 1],
                     )
                     yield_([])
diff --git a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
index f5e9a308e..5d33cb514 100644
--- a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
+++ b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
@@ -76,7 +76,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
                                 tile_in,
                                 a,
                                 src_offsets=[offset0, offset1],
-                                src_sizes=[tile_height, tile_width],
+                                src_sizes=tile_size,
                                 src_strides=[image_width, 1],
                             )
 
@@ -101,7 +101,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
                                 b,
                                 tile_out,
                                 dst_offsets=[offset0, offset1],
-                                dst_sizes=[tile_height, tile_width],
+                                dst_sizes=tile_size,
                                 dst_strides=[image_width, 1],
                             )
 

From 569a2ac4d57166e18cb1fde076734e83a08d6f46 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 18:49:28 -0600
Subject: [PATCH 12/31] fixup multi core channel

---
 .../multi_core_channel/Makefile               |  11 +-
 .../multi_core_channel/multi_core_channel.py  | 144 ++++++++++++------
 .../multi_core_channel/run.py                 |  36 -----
 3 files changed, 108 insertions(+), 83 deletions(-)
 delete mode 100644 programming_examples/matrix_scalar_add/multi_core_channel/run.py

diff --git a/programming_examples/matrix_scalar_add/multi_core_channel/Makefile b/programming_examples/matrix_scalar_add/multi_core_channel/Makefile
index 77dc865ad..18b29d6af 100644
--- a/programming_examples/matrix_scalar_add/multi_core_channel/Makefile
+++ b/programming_examples/matrix_scalar_add/multi_core_channel/Makefile
@@ -4,9 +4,14 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/multi_core_channel.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/multi_core_channel.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
index 953d10fab..0babdaa25 100644
--- a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
@@ -1,42 +1,34 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
+import argparse
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
-from common import *
-
 
 def format_name(prefix, index_0, index_1):
     return f"{prefix}{index_0:02}{index_1:02}"
 
 
 @module_builder
-def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
+    assert image_height % tile_height == 0
+    assert image_width % tile_width == 0
+    image_size = [image_height, image_width]
+    tile_size = [tile_height, tile_width]
+    xrt_dtype = type_mapper(np_dtype)
+
+    memrefTyInOut = MemRefType.get(image_size, xrt_dtype)
 
     # Create an input/output channel pair per worker
-    for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
-        for w in range(IMAGE_WIDTH // TILE_WIDTH):
+    for h in range(image_height // tile_height):
+        for w in range(image_width // tile_width):
             ChannelOp(format_name("ChanIn", h, w))
             ChannelOp(format_name("ChanOut", h, w))
 
@@ -49,33 +41,33 @@ def copy(arg0, arg1):
         def launch_body(a, b):
 
             # Transfer one tile of data per worker
-            for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
-                for w in range(IMAGE_WIDTH // TILE_WIDTH):
-                    offset0 = TILE_HEIGHT * h
-                    offset1 = TILE_WIDTH * w
+            for h in range(image_height // tile_height):
+                for w in range(image_width // tile_width):
+                    offset0 = tile_height * h
+                    offset1 = tile_width * w
 
                     # Put data into the channel tile by tile
                     ChannelPut(
                         format_name("ChanIn", h, w),
                         a,
                         offsets=[offset0, offset1],
-                        sizes=[TILE_HEIGHT, TILE_WIDTH],
-                        strides=[IMAGE_WIDTH, 1],
+                        sizes=tile_size,
+                        strides=[image_width, 1],
                     )
 
             # Transfer one tile of data per worker
-            for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
-                for w in range(IMAGE_WIDTH // TILE_WIDTH):
-                    offset0 = TILE_HEIGHT * h
-                    offset1 = TILE_WIDTH * w
+            for h in range(image_height // tile_height):
+                for w in range(image_width // tile_width):
+                    offset0 = tile_height * h
+                    offset1 = tile_width * w
 
                     # Write data back out to the channel tile by tile
                     ChannelGet(
                         format_name("ChanOut", h, w),
                         b,
                         offsets=[offset0, offset1],
-                        sizes=[TILE_HEIGHT, TILE_WIDTH],
-                        strides=[IMAGE_WIDTH, 1],
+                        sizes=tile_size,
+                        strides=[image_width, 1],
                     )
 
             # The arguments are still the input and the output
@@ -83,8 +75,8 @@ def launch_body(a, b):
             def segment_body():
 
                 # Transfer one tile of data per worker
-                for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
-                    for w in range(IMAGE_WIDTH // TILE_WIDTH):
+                for h in range(image_height // tile_height):
+                    for w in range(image_width // tile_width):
 
                         @herd(name=format_name("xaddherd", h, w), sizes=[1, 1])
                         def herd_body(_tx, _ty, _sx, _sy):
@@ -93,8 +85,8 @@ def herd_body(_tx, _ty, _sx, _sy):
 
                             # This is the type definition of the tile
                             tile_type = MemRefType.get(
-                                shape=TILE_SIZE,
-                                element_type=T.i32(),
+                                shape=tile_size,
+                                element_type=xrt_dtype,
                                 memory_space=mem_space,
                             )
 
@@ -106,22 +98,22 @@ def herd_body(_tx, _ty, _sx, _sy):
                             ChannelGet(format_name("ChanIn", h, w), tile_in)
 
                             # Access every value in the tile
-                            for j in range_(TILE_HEIGHT):
-                                for i in range_(TILE_WIDTH):
+                            for i in range_(tile_height):
+                                for j in range_(tile_width):
                                     # Load the input value from tile_in
-                                    val_in = load(tile_in, [j, i])
+                                    val_in = load(tile_in, [i, j])
 
                                     # Compute the output value
                                     val_out = arith.addi(
                                         val_in,
                                         arith.ConstantOp(
-                                            T.i32(),
-                                            (IMAGE_HEIGHT // TILE_HEIGHT) * h + w,
+                                            xrt_dtype,
+                                            (image_width // tile_width) * h + w,  # row-major tile number: tiles per row * tile row + tile column
                                         ),
                                     )
 
                                     # Store the output value in tile_out
-                                    store(val_out, tile_out, [j, i])
+                                    store(val_out, tile_out, [i, j])
                                     yield_([])
                                 yield_([])
 
@@ -134,5 +126,69 @@ def herd_body(_tx, _ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    # Default values; the image and tile dimensions can be overridden on the command line.
+    IMAGE_WIDTH = 16
+    IMAGE_HEIGHT = 32
+    TILE_WIDTH = 8
+    TILE_HEIGHT = 16
+    INOUT_DATATYPE = np.uint32
+
+    parser = argparse.ArgumentParser(
+        prog="multi_core_channel.py",
+        description="Builds, runs, and tests the matrix_scalar_add/multi_core_channel example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--image-height",
+        type=int,
+        default=IMAGE_HEIGHT,
+        help="Height of the image data",
+    )
+    parser.add_argument(
+        "--image-width", type=int, default=IMAGE_WIDTH, help="Width of the image data"
+    )
+    parser.add_argument(
+        "--tile-height", type=int, default=TILE_HEIGHT, help="Height of the tile data"
+    )
+    parser.add_argument(
+        "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(
+        args.image_height,
+        args.image_width,
+        args.tile_height,
+        args.tile_width,
+        INOUT_DATATYPE,
+    )
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+    # Reference data: each input element holds its row-major index; the expected output adds the row-major tile number.
+    input_a = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    output_b = np.zeros(
+        shape=(args.image_height, args.image_width), dtype=INOUT_DATATYPE
+    )
+    for i in range(args.image_height):
+        for j in range(args.image_width):
+            input_a[i, j] = i * args.image_width + j
+            tile_num = (
+                i // args.tile_height * (args.image_width // args.tile_width)
+                + j // args.tile_width
+            )
+            output_b[i, j] = input_a[i, j] + tile_num
+    # Hand the module and the reference data to the runner; its return value becomes the process exit code.
+    runner = XRTRunner(verbose=args.verbose)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/matrix_scalar_add/multi_core_channel/run.py b/programming_examples/matrix_scalar_add/multi_core_channel/run.py
deleted file mode 100644
index c23496a92..000000000
--- a/programming_examples/matrix_scalar_add/multi_core_channel/run.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from multi_core_channel.multi_core_channel import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/multi_core_channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, experimental_passes=True, verbose=args.verbose)

From 0585149d8bb25b5f77d660efdb915d3a98def4b9 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 19 Jul 2024 19:02:21 -0600
Subject: [PATCH 13/31] Fixing up multi launch example, not working currently

---
 .../multi_launch_channel.py                   | 121 ++++++++++--------
 1 file changed, 71 insertions(+), 50 deletions(-)

diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
index a2c56502a..2868f3cbf 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
@@ -25,8 +25,8 @@ def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
 
     # Create two channels which will send/receive the
     # input/output data respectively
-    ChannelOp("ChanIn")
-    ChannelOp("ChanOut")
+    ChannelOp("ChanIn", size=[image_height // tile_height, image_width // tile_width])
+    ChannelOp("ChanOut", size=[image_height // tile_height, image_width // tile_width])
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
@@ -68,6 +68,7 @@ def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
                 offsets=[offset0, offset1],
                 sizes=tile_size,
                 strides=[image_width, 1],
+                indices=[tile_index0, tile_index1],
             )
 
             # Write data back out to the channel tile by tile
@@ -76,65 +77,85 @@ def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
                 b,
                 offsets=[offset0, offset1],
                 sizes=tile_size,
-                strides=[image_width, 1],
+                indices=[tile_index0, tile_index1],
             )
 
             # The arguments are still the input and the output
-            @segment(name="seg", operands=[a, b])
-            def segment_body(arg2, arg3):
+            @segment(name="seg", operands=[tile_index0, tile_index1])
+            def segment_body(launch_index0, launch_index1):
 
                 # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
                 # We just need one compute core, so we ask for a 1x1 herd
-                @herd(name="xaddherd", sizes=[1, 1], operands=[arg2, arg3])
-                def herd_body(tx, ty, sx, sy, a, b):
-
-                    # Loop over columns and rows of tiles
-                    for tile_num in range_(
-                        (image_width // tile_width) * (image_height // tile_height)
-                    ):
-
-                        # We want to store our data in L1 memory
-                        mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
-                        # This is the type definition of the tile
-                        tile_type = MemRefType.get(
-                            shape=tile_size,
-                            element_type=xrt_dtype,
-                            memory_space=mem_space,
-                        )
-
-                        # We must allocate a buffer of tile size for the input/output
-                        tile_in = AllocOp(tile_type, [], [])
-                        tile_out = AllocOp(tile_type, [], [])
-
-                        # Copy a tile from the input image (a) into the L1 memory region (tile_in)
-                        ChannelGet("ChanIn", tile_in)
-
-                        # Access every value in the tile
-                        for j in range_(tile_height):
-                            for i in range_(tile_width):
-                                # Load the input value from tile_in
-                                val_in = load(tile_in, [j, i])
-
-                                # Compute the output value TODO(hunhoffe): this is not correct, not sure how to percolate launch info here
-                                val_out = arith.addi(
-                                    val_in, arith.index_cast(xrt_dtype, tile_num)
-                                )
-
-                                # Store the output value in tile_out
-                                store(val_out, tile_out, [j, i])
-                                yield_([])
-                            yield_([])
+                @herd(
+                    name="xaddherd",
+                    sizes=[1, 1],
+                    operands=[launch_index0, launch_index1],
+                )
+                def herd_body(tx, ty, sx, sy, index0, index1):
+                    create_tile_index_height = AffineMap.get(
+                        0,
+                        1,
+                        [
+                            AffineExpr.get_mul(
+                                AffineSymbolExpr.get(0),
+                                AffineConstantExpr.get(image_width // tile_width),
+                            )
+                        ],
+                    )
+                    create_tile_index = AffineMap.get(
+                        0,
+                        2,
+                        [
+                            AffineExpr.get_add(
+                                AffineSymbolExpr.get(0),
+                                AffineSymbolExpr.get(1),
+                            )
+                        ],
+                    )
+                    tile_index_height = affine_apply(create_tile_index_height, [index0])
+                    tile_num = affine_apply(
+                        create_tile_index, [tile_index_height, index1]
+                    )
+
+                    # We want to store our data in L1 memory
+                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
 
-                        # Copy the output tile into the output
-                        ChannelPut("ChanOut", tile_out)
+                    # This is the type definition of the tile
+                    tile_type = MemRefType.get(
+                        shape=tile_size,
+                        element_type=xrt_dtype,
+                        memory_space=mem_space,
+                    )
 
-                        # Deallocate our L1 buffers
-                        DeallocOp(tile_in)
-                        DeallocOp(tile_out)
+                    # We must allocate a buffer of tile size for the input/output
+                    tile_in = AllocOp(tile_type, [], [])
+                    tile_out = AllocOp(tile_type, [], [])
 
+                    # Copy a tile from the input image (a) into the L1 memory region (tile_in)
+                    ChannelGet("ChanIn", tile_in, indices=[index0, index1])
+
+                    # Access every value in the tile
+                    for j in range_(tile_height):
+                        for i in range_(tile_width):
+                            # Load the input value from tile_in
+                            val_in = load(tile_in, [j, i])
+
+                            val_out = arith.addi(
+                                val_in, arith.index_cast(xrt_dtype, tile_num)
+                            )
+
+                            # Store the output value in tile_out
+                            store(val_out, tile_out, [j, i])
+                            yield_([])
                         yield_([])
 
+                    # Copy the output tile into the output
+                    ChannelPut("ChanOut", tile_out, indices=[index0, index1])
+
+                    # Deallocate our L1 buffers
+                    DeallocOp(tile_in)
+                    DeallocOp(tile_out)
+
 
 if __name__ == "__main__":
     # Default values.

From 69517a01f4b5243b01d64b7f791bcf41760714cf Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 10:05:08 -0600
Subject: [PATCH 14/31] Rewrite multi-launch channel in a way that makes more
 sense; it still fails

---
 .../multi_launch_channel.py                   | 195 +++++++-----------
 1 file changed, 77 insertions(+), 118 deletions(-)

diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
index 2868f3cbf..c2c241234 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
@@ -7,12 +7,15 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.dialects.affine import apply as affine_apply
 from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
 
+def format_name(prefix, index_0, index_1):
+    return f"{prefix}{index_0:02}{index_1:02}"
+
+
 @module_builder
 def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
     assert image_height % tile_height == 0
@@ -23,138 +26,94 @@ def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
 
     memrefTyInOut = MemRefType.get(image_size, xrt_dtype)
 
-    # Create two channels which will send/receive the
-    # input/output data respectively
-    ChannelOp("ChanIn", size=[image_height // tile_height, image_width // tile_width])
-    ChannelOp("ChanOut", size=[image_height // tile_height, image_width // tile_width])
+    # Create an input/output channel pair per launch
+    for h in range(image_height // tile_height):
+        for w in range(image_width // tile_width):
+            ChannelOp(format_name("ChanIn", h, w))
+            ChannelOp(format_name("ChanOut", h, w))
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
     def copy(arg0, arg1):
 
-        # The arguments are the input and output
-        @launch(
-            sizes=[image_height // tile_height, image_width // tile_width],
-            operands=[arg0, arg1],
-        )
-        def launch_body(tile_index0, tile_index1, _launch_size_x, _launch_size_y, a, b):
-            scaled_index_map_height = AffineMap.get(
-                0,
-                1,
-                [
-                    AffineExpr.get_mul(
-                        AffineSymbolExpr.get(0),
-                        AffineConstantExpr.get(tile_height),
-                    )
-                ],
-            )
-            scaled_index_map_width = AffineMap.get(
-                0,
-                1,
-                [
-                    AffineExpr.get_mul(
-                        AffineSymbolExpr.get(0),
-                        AffineConstantExpr.get(tile_width),
-                    )
-                ],
-            )
-            offset0 = affine_apply(scaled_index_map_height, [tile_index0])
-            offset1 = affine_apply(scaled_index_map_width, [tile_index1])
-
-            # Put data into the channel tile by tile
-            ChannelPut(
-                "ChanIn",
-                a,
-                offsets=[offset0, offset1],
-                sizes=tile_size,
-                strides=[image_width, 1],
-                indices=[tile_index0, tile_index1],
-            )
+        # Create one launch per tile of the image
+        for h in range(image_height // tile_height):
+            for w in range(image_width // tile_width):
 
-            # Write data back out to the channel tile by tile
-            ChannelGet(
-                "ChanOut",
-                b,
-                offsets=[offset0, offset1],
-                sizes=tile_size,
-                indices=[tile_index0, tile_index1],
-            )
+                # The arguments are the input and output
+                @launch(name=format_name("launch", h, w), operands=[arg0, arg1])
+                def launch_body(a, b):
 
-            # The arguments are still the input and the output
-            @segment(name="seg", operands=[tile_index0, tile_index1])
-            def segment_body(launch_index0, launch_index1):
-
-                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
-                # We just need one compute core, so we ask for a 1x1 herd
-                @herd(
-                    name="xaddherd",
-                    sizes=[1, 1],
-                    operands=[launch_index0, launch_index1],
-                )
-                def herd_body(tx, ty, sx, sy, index0, index1):
-                    create_tile_index_height = AffineMap.get(
-                        0,
-                        1,
-                        [
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(0),
-                                AffineConstantExpr.get(image_width // tile_width),
-                            )
-                        ],
-                    )
-                    create_tile_index = AffineMap.get(
-                        0,
-                        2,
-                        [
-                            AffineExpr.get_add(
-                                AffineSymbolExpr.get(0),
-                                AffineSymbolExpr.get(1),
-                            )
-                        ],
-                    )
-                    tile_index_height = affine_apply(create_tile_index_height, [index0])
-                    tile_num = affine_apply(
-                        create_tile_index, [tile_index_height, index1]
-                    )
+                    offset0 = tile_height * h
+                    offset1 = tile_width * w
 
-                    # We want to store our data in L1 memory
-                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
-                    # This is the type definition of the tile
-                    tile_type = MemRefType.get(
-                        shape=tile_size,
-                        element_type=xrt_dtype,
-                        memory_space=mem_space,
+                    # Put data into the channel tile by tile
+                    ChannelPut(
+                        format_name("ChanIn", h, w),
+                        a,
+                        offsets=[offset0, offset1],
+                        sizes=tile_size,
+                        strides=[image_width, 1],
                     )
 
-                    # We must allocate a buffer of tile size for the input/output
-                    tile_in = AllocOp(tile_type, [], [])
-                    tile_out = AllocOp(tile_type, [], [])
+                    # Receive this launch's result tile from the output channel into the output image (b)
+                    ChannelGet(
+                        format_name("ChanOut", h, w),
+                        b,
+                        offsets=[offset0, offset1],
+                        sizes=tile_size,
+                        strides=[image_width, 1],
+                    )
 
-                    # Copy a tile from the input image (a) into the L1 memory region (tile_in)
-                    ChannelGet("ChanIn", tile_in, indices=[index0, index1])
+                    # The segment takes no arguments; the herd moves its tile in and out through the channels
+                    @segment(name=format_name("segment", h, w))
+                    def segment_body():
 
-                    # Access every value in the tile
-                    for j in range_(tile_height):
-                        for i in range_(tile_width):
-                            # Load the input value from tile_in
-                            val_in = load(tile_in, [j, i])
+                        @herd(name=format_name("xaddherd", h, w), sizes=[1, 1])
+                        def herd_body(_tx, _ty, _sx, _sy):
+                            # We want to store our data in L1 memory
+                            mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
 
-                            val_out = arith.addi(
-                                val_in, arith.index_cast(xrt_dtype, tile_num)
+                            # This is the type definition of the tile
+                            tile_type = MemRefType.get(
+                                shape=tile_size,
+                                element_type=xrt_dtype,
+                                memory_space=mem_space,
                             )
 
-                            # Store the output value in tile_out
-                            store(val_out, tile_out, [j, i])
-                            yield_([])
-                        yield_([])
-
-                    # Copy the output tile into the output
-                    ChannelPut("ChanOut", tile_out, indices=[index0, index1])
-
-                    # Deallocate our L1 buffers
-                    DeallocOp(tile_in)
-                    DeallocOp(tile_out)
+                            # We must allocate a buffer of tile size for the input/output
+                            tile_in = AllocOp(tile_type, [], [])
+                            tile_out = AllocOp(tile_type, [], [])
+
+                            # Copy a tile from the input image (a) into the L1 memory region (tile_in)
+                            ChannelGet(format_name("ChanIn", h, w), tile_in)
+
+                            # Access every value in the tile
+                            for i in range_(tile_height):
+                                for j in range_(tile_width):
+                                    # Load the input value from tile_in
+                                    val_in = load(tile_in, [i, j])
+
+                                    # Compute the output value
+                                    val_out = arith.addi(
+                                        val_in,
+                                        arith.ConstantOp(
+                                            xrt_dtype,
+                                            (image_height // tile_height) * h + w,
+                                        ),
+                                    )
+
+                                    # Store the output value in tile_out
+                                    store(val_out, tile_out, [i, j])
+                                    yield_([])
+                                yield_([])
+
+                            # Copy the output tile into the output
+                            ChannelPut(format_name("ChanOut", h, w), tile_out)
+
+                            # Deallocate our L1 buffers
+                            DeallocOp(tile_in)
+                            DeallocOp(tile_out)
 
 
 if __name__ == "__main__":

From e48465ea722382aca1d82767c9b6049ad3622c5b Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 10:07:06 -0600
Subject: [PATCH 15/31] Mark multi launch test to fail

---
 .../matrix_scalar_add/multi_launch_channel/run_makefile.lit      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/run_makefile.lit b/programming_examples/matrix_scalar_add/multi_launch_channel/run_makefile.lit
index fe881ef0f..4628dfaa6 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/run_makefile.lit
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/run_makefile.lit
@@ -6,3 +6,4 @@
  // RUN: make -f %S/Makefile clean
  // RUN: make -f %S/Makefile run | FileCheck %s
  // CHECK: PASS!
+ // XFAIL: *

From a6ae88b2800a011d5fb1fa9ec38df6abb7c21ebe Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 11:14:53 -0600
Subject: [PATCH 16/31] Clean up shim dma 2d example

---
 programming_examples/shim_dma_2d/run.py       | 73 +++----------------
 .../shim_dma_2d/shim_dma_2d.py                | 20 ++---
 programming_examples/shim_dma_2d/test.cpp     | 49 +++++++------
 programming_examples/shim_dma_2d/test.py      | 42 +++--------
 4 files changed, 60 insertions(+), 124 deletions(-)

diff --git a/programming_examples/shim_dma_2d/run.py b/programming_examples/shim_dma_2d/run.py
index dc35fff15..e26643829 100644
--- a/programming_examples/shim_dma_2d/run.py
+++ b/programming_examples/shim_dma_2d/run.py
@@ -4,74 +4,21 @@
 # SPDX-License-Identifier: MIT
 
 import numpy as np
-import air.backend.xrt as xrt_backend
-import os
-import os.path
-import filelock
+from air.backend.xrt_runner import XRTRunner
 from shim_dma_2d import *
 
-KERNEL_NAME = "MLIR_AIE"
-
 INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-verbose = False
+VERBOSE = False
 
 
-def main():
+if __name__ == "__main__":
     mlir_module = build_module()
 
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    output_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = i + 0x1000
-        output_b[i] = 0x00DEFACED
-
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
-    )
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        mul = backend.compile_and_load(mlir_module)
-        (_, output_b) = mul(input_a, output_b)
-
-    backend.unload()
+    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)
+    output_b = np.zeros(shape=IMAGE_SIZE, dtype=INOUT_DATATYPE)
+    for h in range(TILE_HEIGHT):
+        for w in range(TILE_WIDTH):
+            output_b[h, w] = input_a[h, w]
 
-    # check output, should have the top left filled in
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-
-        row = i / IMAGE_WIDTH
-        col = i % IMAGE_WIDTH
-
-        if row < TILE_HEIGHT and col < TILE_WIDTH:
-            # value should have been updated
-            if not (rb == 0x1000 + i):
-                print(f"IM {i} [{col}, {row}] should be 0x{i:x}, is 0x{rb:x}\n")
-                errors += 1
-        else:
-            # value should stay unchanged
-            if rb != 0x00DEFACED:
-                print(
-                    f"IM {i} [{col}, {row}] should be 0xdefaced, is 0x{rb:x}\n",
-                    i,
-                    col,
-                    row,
-                    rb,
-                )
-                errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
-
-
-if __name__ == "__main__":
-    main()
+    runner = XRTRunner(verbose=VERBOSE)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/shim_dma_2d/shim_dma_2d.py b/programming_examples/shim_dma_2d/shim_dma_2d.py
index 89d623623..a68ab3792 100644
--- a/programming_examples/shim_dma_2d/shim_dma_2d.py
+++ b/programming_examples/shim_dma_2d/shim_dma_2d.py
@@ -11,11 +11,14 @@
 
 IMAGE_WIDTH = 32
 IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
 
 TILE_WIDTH = 16
 TILE_HEIGHT = 8
-TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]
+TILE_SIZE = [TILE_HEIGHT, TILE_WIDTH]
+
+assert IMAGE_HEIGHT % TILE_HEIGHT == 0
+assert IMAGE_WIDTH % TILE_WIDTH == 0
 
 
 @module_builder
@@ -36,9 +39,8 @@ def segment_body(arg2, arg3):
 
                 # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
                 # We just need one compute core, so we ask for a 1x1 herd
-                @herd(name="copyherd", sizes=[1, 1], operands=[arg2, arg3])
-                def herd_body(tx, ty, sx, sy, a, b):
-
+                @herd(name="xaddherd", sizes=[1, 1], operands=[arg2, arg3])
+                def herd_body(_tx, _ty, _sx, _sy, a, b):
                     # We want to store our data in L1 memory
                     mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
 
@@ -58,13 +60,13 @@ def herd_body(tx, ty, sx, sy, a, b):
                         tile_in,
                         a,
                         src_offsets=[0, 0],
-                        src_sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        src_sizes=TILE_SIZE,
                         src_strides=[IMAGE_WIDTH, 1],
                     )
 
                     # Access every value in the tile
-                    for j in range_(TILE_HEIGHT):
-                        for i in range_(TILE_WIDTH):
+                    for i in range_(TILE_HEIGHT):
+                        for j in range_(TILE_WIDTH):
                             # Load the input value from tile_in
                             val = load(tile_in, [i, j])
 
@@ -78,7 +80,7 @@ def herd_body(tx, ty, sx, sy, a, b):
                         b,
                         tile_out,
                         dst_offsets=[0, 0],
-                        dst_sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        dst_sizes=TILE_SIZE,
                         dst_strides=[IMAGE_WIDTH, 1],
                     )
 
diff --git a/programming_examples/shim_dma_2d/test.cpp b/programming_examples/shim_dma_2d/test.cpp
index 3eb7a8aef..f10fa15cf 100644
--- a/programming_examples/shim_dma_2d/test.cpp
+++ b/programming_examples/shim_dma_2d/test.cpp
@@ -147,9 +147,13 @@ int main(int argc, const char *argv[]) {
   // Initialize buffers bo_inA and bo_out
   DATATYPE *bufInA = bo_inA.map<DATATYPE *>();
   DATATYPE *bufOut = bo_out.map<DATATYPE *>();
-  for (int i = 0; i < IMAGE_SIZE; i++) {
-    bufInA[i] = i + 0x1000;
-    bufOut[i] = 0x00defaced;
+
+  for (int i = 0; i < IMAGE_HEIGHT; i++) {
+    for (int j = 0; j < IMAGE_WIDTH; j++) {
+      int index = i * IMAGE_WIDTH + j;
+      bufInA[index] = index + 0x1000;
+      bufOut[index] = 0x00defaced;
+    }
   }
 
   // sync host to device memories
@@ -169,24 +173,27 @@ int main(int argc, const char *argv[]) {
 
   // check output, should have the top left filled in
   int errors = 0;
-  for (int i = 0; i < IMAGE_SIZE; i++) {
-    uint32_t rb = bufOut[i];
-
-    uint32_t row = i / IMAGE_WIDTH;
-    uint32_t col = i % IMAGE_WIDTH;
-
-    if ((row < TILE_HEIGHT) && (col < TILE_WIDTH)) {
-      // value should have been updated
-      if (!(rb == 0x1000 + i)) {
-        printf("IM %d [%d, %d] should be %08X, is %08X\n", i, col, row, i, rb);
-        errors++;
-      }
-    } else {
-      // value should stay unchanged
-      if (rb != 0x00defaced) {
-        printf("IM %d [%d, %d] should be 0xdefaced, is %08X\n", i, col, row,
-               rb);
-        errors++;
+  for (int i = 0; i < IMAGE_HEIGHT; i++) {
+    for (int j = 0; j < IMAGE_WIDTH; j++) {
+      int index = i * IMAGE_WIDTH + j;
+      uint32_t rb = bufOut[index];
+
+      if ((i < TILE_HEIGHT) && (j < TILE_WIDTH)) {
+        uint32_t expected = bufInA[index];
+
+        // value should have been updated
+        if (rb != expected) {
+          printf("IM %d [%d, %d] should be 0x%08x, is 0x%08x\n", index, i, j,
+                 expected, rb);
+          errors++;
+        }
+      } else {
+        // value should stay unchanged
+        if (rb != 0x00defaced) {
+          printf("IM %d [%d, %d] should be 0x0defaced, is 0x%08x\n", index, i,
+                 j, rb);
+          errors++;
+        }
       }
     }
   }
diff --git a/programming_examples/shim_dma_2d/test.py b/programming_examples/shim_dma_2d/test.py
index 8502f1c44..531a79176 100644
--- a/programming_examples/shim_dma_2d/test.py
+++ b/programming_examples/shim_dma_2d/test.py
@@ -74,11 +74,14 @@ def main():
     bo_instr.write(instr_v, 0)
     bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
 
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    output_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = i + 0x1000
-        output_a[i] = 0x00DEFACED
+    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)
+    output_a = np.zeros(shape=IMAGE_SIZE, dtype=INOUT_DATATYPE)
+    expected_output = np.zeros(shape=IMAGE_SIZE, dtype=INOUT_DATATYPE)
+
+    for h in range(TILE_HEIGHT):
+        for w in range(TILE_WIDTH):
+            expected_output[h, w] = input_a[h, w]
+
     bo_in.write(input_a, 0)
     bo_in.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
 
@@ -92,35 +95,12 @@ def main():
     output_buffer = bo_out.read(INOUT_SIZE_BYTES, 0).view(INOUT_DATATYPE)
 
     # check output, should have the top left filled in
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_buffer[i]
-
-        row = i / IMAGE_WIDTH
-        col = i % IMAGE_WIDTH
-
-        if row < TILE_HEIGHT and col < TILE_WIDTH:
-            # value should have been updated
-            if not (rb == 0x1000 + i):
-                print(f"IM {i} [{col}, {row}] should be 0x{i:x}, is 0x{rb:x}\n")
-                errors += 1
-        else:
-            # value should stay unchanged
-            if rb != 0x00DEFACED:
-                print(
-                    f"IM {i} [{col}, {row}] should be 0xdefaced, is 0x{rb:x}\n",
-                    i,
-                    col,
-                    row,
-                    rb,
-                )
-                errors += 1
-
-    if errors == 0:
+    actual_output = np.reshape(output_buffer, expected_output.shape)
+    if np.array_equal(actual_output, expected_output):
         print("PASS!")
         exit(0)
     else:
-        print("failed. errors=", errors)
+        print("failed")
         exit(-1)
 
 

From 14cd7fc7f5d4e1d146b606dad564dda23b798982 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 11:25:04 -0600
Subject: [PATCH 17/31] Clean up segment alloc example

---
 programming_examples/segment_alloc/run.py     | 62 +++----------------
 .../segment_alloc/segment_alloc.py            | 15 +++--
 2 files changed, 17 insertions(+), 60 deletions(-)

diff --git a/programming_examples/segment_alloc/run.py b/programming_examples/segment_alloc/run.py
index 332ae67b6..e027e57d7 100644
--- a/programming_examples/segment_alloc/run.py
+++ b/programming_examples/segment_alloc/run.py
@@ -7,70 +7,24 @@
 
 import argparse
 import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
+from air.backend.xrt_runner import XRTRunner
 
 from segment_alloc import *
 
 INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
 
 
 def main(verbose=False, experimental_passes=False):
     mlir_module = build_module()
 
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    output_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = i + 0x1000
-        output_b[i] = 0x00DEFACED
+    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)
+    output_b = np.zeros(shape=IMAGE_SIZE, dtype=INOUT_DATATYPE)
+    for h in range(TILE_HEIGHT):
+        for w in range(TILE_WIDTH):
+            output_b[h, w] = input_a[h, w]
 
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose,
-        experimental_passes=experimental_passes,
-        omit_while_true_loop=True,
-    )
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        mul = backend.compile_and_load(mlir_module)
-        (_, output_b) = mul(input_a, output_b)
-
-    backend.unload()
-
-    # check output, should have the top left filled in
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-
-        row = i / IMAGE_WIDTH
-        col = i % IMAGE_WIDTH
-
-        if row < TILE_HEIGHT and col < TILE_WIDTH:
-            # value should have been updated
-            if not (rb == 0x1000 + i):
-                print(f"IM {i} [{col}, {row}] should be 0x{i:x}, is 0x{rb:x}\n")
-                errors += 1
-        else:
-            # value should stay unchanged
-            if rb != 0x00DEFACED:
-                print(
-                    f"IM {i} [{col}, {row}] should be 0xdefaced, is 0x{rb:x}\n",
-                    i,
-                    col,
-                    row,
-                    rb,
-                )
-                errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
+    runner = XRTRunner(verbose=verbose, experimental_passes=experimental_passes)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
 
 
 if __name__ == "__main__":
diff --git a/programming_examples/segment_alloc/segment_alloc.py b/programming_examples/segment_alloc/segment_alloc.py
index ad2c28a0f..a5e2f1741 100644
--- a/programming_examples/segment_alloc/segment_alloc.py
+++ b/programming_examples/segment_alloc/segment_alloc.py
@@ -11,11 +11,14 @@
 
 IMAGE_WIDTH = 32
 IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
 
 TILE_WIDTH = 16
 TILE_HEIGHT = 8
-TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]
+TILE_SIZE = [TILE_HEIGHT, TILE_WIDTH]
+
+assert IMAGE_HEIGHT % TILE_HEIGHT == 0
+assert IMAGE_WIDTH % TILE_WIDTH == 0
 
 
 @module_builder
@@ -69,7 +72,7 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
                         my_l2_tile,
                         a,
                         src_offsets=[0, 0],
-                        src_sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        src_sizes=TILE_SIZE,
                         src_strides=[IMAGE_WIDTH, 1],
                     )
 
@@ -80,8 +83,8 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
                     )
 
                     # Access every value in the tile
-                    for j in range_(TILE_HEIGHT):
-                        for i in range_(TILE_WIDTH):
+                    for i in range_(TILE_HEIGHT):
+                        for j in range_(TILE_WIDTH):
                             # Load the input value from tile_in
                             val = load(tile_in_l1, [i, j])
 
@@ -95,7 +98,7 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
                         b,
                         tile_out_l1,
                         dst_offsets=[0, 0],
-                        dst_sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        dst_sizes=TILE_SIZE,
                         dst_strides=[IMAGE_WIDTH, 1],
                     )
 

From 5091c2455597bacb148a87f6f67dc565f15488bd Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 11:32:43 -0600
Subject: [PATCH 18/31] Clean up segment_alloc code a little bit more

---
 programming_examples/segment_alloc/Makefile   |  2 +-
 programming_examples/segment_alloc/run.py     | 42 -------------------
 .../segment_alloc/segment_alloc.py            | 36 +++++++++++++++-
 3 files changed, 35 insertions(+), 45 deletions(-)
 delete mode 100644 programming_examples/segment_alloc/run.py

diff --git a/programming_examples/segment_alloc/Makefile b/programming_examples/segment_alloc/Makefile
index e25a18738..2b3464752 100644
--- a/programming_examples/segment_alloc/Makefile
+++ b/programming_examples/segment_alloc/Makefile
@@ -6,7 +6,7 @@ targetname := $(shell basename ${srcdir})
 
 run:
 	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	cd build && ${powershell} python3 ${srcdir}/segment_alloc.py
 
 clean:
 	rm -rf build __pycache__
\ No newline at end of file
diff --git a/programming_examples/segment_alloc/run.py b/programming_examples/segment_alloc/run.py
deleted file mode 100644
index e027e57d7..000000000
--- a/programming_examples/segment_alloc/run.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: MIT
-
-import argparse
-import numpy as np
-from air.backend.xrt_runner import XRTRunner
-
-from segment_alloc import *
-
-INOUT_DATATYPE = np.uint32
-
-
-def main(verbose=False, experimental_passes=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)
-    output_b = np.zeros(shape=IMAGE_SIZE, dtype=INOUT_DATATYPE)
-    for h in range(TILE_HEIGHT):
-        for w in range(TILE_WIDTH):
-            output_b[h, w] = input_a[h, w]
-
-    runner = XRTRunner(verbose=verbose, experimental_passes=experimental_passes)
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the segment_alloc example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    main(experimental_passes=True, verbose=args.verbose)
diff --git a/programming_examples/segment_alloc/segment_alloc.py b/programming_examples/segment_alloc/segment_alloc.py
index a5e2f1741..aa1a1e926 100644
--- a/programming_examples/segment_alloc/segment_alloc.py
+++ b/programming_examples/segment_alloc/segment_alloc.py
@@ -1,11 +1,14 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
@@ -20,6 +23,8 @@
 assert IMAGE_HEIGHT % TILE_HEIGHT == 0
 assert IMAGE_WIDTH % TILE_WIDTH == 0
 
+INOUT_DATATYPE = np.uint32
+
 
 @module_builder
 def build_module():
@@ -108,5 +113,32 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="segment_alloc.py",
+        description="Builds, runs, and tests the segment_alloc example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)
+    output_b = np.zeros(shape=IMAGE_SIZE, dtype=INOUT_DATATYPE)
+    for h in range(TILE_HEIGHT):
+        for w in range(TILE_WIDTH):
+            output_b[h, w] = input_a[h, w]
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))

From 505b5c182ebddde1e6b6eca076436007487f1da1 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 11:45:26 -0600
Subject: [PATCH 19/31] update multi-segment dma example

---
 .../multi_segment/multi_segment_dma/Makefile  |  2 +-
 .../multi_segment_dma/multi_segment.py        | 58 +++++++++++++------
 .../multi_segment/multi_segment_dma/run.py    | 35 -----------
 3 files changed, 42 insertions(+), 53 deletions(-)
 delete mode 100644 programming_examples/multi_segment/multi_segment_dma/run.py

diff --git a/programming_examples/multi_segment/multi_segment_dma/Makefile b/programming_examples/multi_segment/multi_segment_dma/Makefile
index 844c5686d..374d54e6b 100644
--- a/programming_examples/multi_segment/multi_segment_dma/Makefile
+++ b/programming_examples/multi_segment/multi_segment_dma/Makefile
@@ -6,7 +6,7 @@ targetname := $(shell basename ${srcdir})
 
 run:
 	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py -v
+	cd build && ${powershell} python3 ${srcdir}/multi_segment.py
 
 clean:
 	rm -rf build __pycache__
diff --git a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
index fee34f23f..939bbfc4f 100644
--- a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
@@ -1,28 +1,21 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
-from common import *
+VECTOR_LEN = 32
+VECTOR_SIZE = [VECTOR_LEN, 1]
+
+INOUT_DATATYPE = np.uint32
 
 
 @module_builder
@@ -46,7 +39,6 @@ def copy(arg0, arg1, arg2, arg3):
         # The arguments are the input and output
         @launch(operands=[arg0, arg1, arg2, arg3])
         def launch_body(a, b, c, d):
-
             @segment(name="seg1", operands=[a, c])
             def segment_body(arg0, arg2):
 
@@ -96,5 +88,37 @@ def herd_body(tx, ty, sx, sy, b, d):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="multi_segment.py",
+        description="Builds, runs, and tests the multi-segment DMA example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.full(VECTOR_SIZE, 2, dtype=INOUT_DATATYPE)
+    input_b = np.full(VECTOR_SIZE, 3, dtype=INOUT_DATATYPE)
+    output_c = np.full(VECTOR_SIZE, 12, dtype=INOUT_DATATYPE)
+    output_d = np.full(VECTOR_SIZE, 13, dtype=INOUT_DATATYPE)
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module,
+            inputs=[input_a, input_b],
+            expected_outputs=[output_c, output_d],
+        )
+    )
diff --git a/programming_examples/multi_segment/multi_segment_dma/run.py b/programming_examples/multi_segment/multi_segment_dma/run.py
deleted file mode 100644
index a514eb5a3..000000000
--- a/programming_examples/multi_segment/multi_segment_dma/run.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from multi_segment_dma.multi_segment import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the multi-segment DMA example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, verbose=args.verbose)

From 609709b2f80dfcc240630fdc4867fb727088319c Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 11:53:20 -0600
Subject: [PATCH 20/31] finish cleaning up multi segment

---
 programming_examples/multi_segment/common.py  | 82 -------------------
 .../multi_segment_channel/Makefile            |  2 +-
 .../multi_segment_channel/multi_segment.py    | 58 +++++++++----
 .../multi_segment_channel/run.py              | 35 --------
 4 files changed, 42 insertions(+), 135 deletions(-)
 delete mode 100644 programming_examples/multi_segment/common.py
 delete mode 100644 programming_examples/multi_segment/multi_segment_channel/run.py

diff --git a/programming_examples/multi_segment/common.py b/programming_examples/multi_segment/common.py
deleted file mode 100644
index 2d62308cb..000000000
--- a/programming_examples/multi_segment/common.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-VECTOR_LEN = 32
-VECTOR_SIZE = [VECTOR_LEN, 1]
-
-INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = VECTOR_SIZE[0] * VECTOR_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def test_main(build_module, verbose=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_c = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_d = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = 0x2
-        input_b[i] = 0x3
-    for i in range(INOUT_SIZE):
-        input_c[i] = 0x00C0FFEE
-        input_d[i] = 0x0000CAFE
-
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
-    )
-
-    if verbose:
-        print(input_a)
-        print(input_b)
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        addone = backend.compile_and_load(mlir_module)
-        (_, _, output_c, output_d) = addone(input_a, input_b, input_c, input_d)
-
-    backend.unload()
-
-    if verbose:
-        print(output_c)
-        print(output_d)
-
-    # check output, should have all values incremented
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_c[i]
-
-        # value should have been updated
-        if not (rb == 12):
-            """
-            print(
-                f"C - IM {i} should be 0x{expected_value:x}, is 0x{rb:x}\n"
-            )
-            """
-            errors += 1
-
-    for i in range(INOUT_SIZE):
-        rb = output_d[i]
-
-        # value should have been updated
-        if not (rb == 13):
-            """
-            print(
-                f"D - IM {i} should be 0x{expected_value:x}, is 0x{rb:x}\n"
-            )
-            """
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
diff --git a/programming_examples/multi_segment/multi_segment_channel/Makefile b/programming_examples/multi_segment/multi_segment_channel/Makefile
index 844c5686d..374d54e6b 100644
--- a/programming_examples/multi_segment/multi_segment_channel/Makefile
+++ b/programming_examples/multi_segment/multi_segment_channel/Makefile
@@ -6,7 +6,7 @@ targetname := $(shell basename ${srcdir})
 
 run:
 	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py -v
+	cd build && ${powershell} python3 ${srcdir}/multi_segment.py
 
 clean:
 	rm -rf build __pycache__
diff --git a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
index 6773c4875..f86574b51 100644
--- a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
@@ -1,28 +1,21 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
-from common import *
+VECTOR_LEN = 32
+VECTOR_SIZE = [VECTOR_LEN, 1]
+
+INOUT_DATATYPE = np.uint32
 
 
 @module_builder
@@ -84,7 +77,6 @@ def segment_body():
 
                 @herd(name="addherd2", sizes=[1, 1])
                 def herd_body(tx, ty, sx, sy):
-
                     image_in_b = AllocOp(image_type_l1, [], [])
                     image_out_b = AllocOp(image_type_l1, [], [])
 
@@ -105,5 +97,37 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="multi_segment.py",
+        description="Builds, runs, and tests the multi-segment channel example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.full(VECTOR_SIZE, 2, dtype=INOUT_DATATYPE)
+    input_b = np.full(VECTOR_SIZE, 3, dtype=INOUT_DATATYPE)
+    output_c = np.full(VECTOR_SIZE, 12, dtype=INOUT_DATATYPE)
+    output_d = np.full(VECTOR_SIZE, 13, dtype=INOUT_DATATYPE)
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module,
+            inputs=[input_a, input_b],
+            expected_outputs=[output_c, output_d],
+        )
+    )
diff --git a/programming_examples/multi_segment/multi_segment_channel/run.py b/programming_examples/multi_segment/multi_segment_channel/run.py
deleted file mode 100644
index 37fd0fc81..000000000
--- a/programming_examples/multi_segment/multi_segment_channel/run.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from multi_segment_channel.multi_segment import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the multi-segment channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, verbose=args.verbose)

From 2c8c3dd4352d95d3b05be84fc72c39a2932e2fed Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 13:39:18 -0600
Subject: [PATCH 21/31] Test different data types with transpose DMA

---
 .../data_transfer_transpose/channel/Makefile  |  17 ++-
 .../data_transfer_transpose/channel/run.py    |  49 --------
 .../channel/run_makefile.lit                  |   3 +-
 .../channel/transpose.py                      | 107 ++++++++++++----
 .../data_transfer_transpose/common.py         |  75 -----------
 .../data_transfer_transpose/dma/Makefile      |  17 ++-
 .../data_transfer_transpose/dma/run.py        |  49 --------
 .../dma/run_makefile.lit                      |   3 +-
 .../data_transfer_transpose/dma/transpose.py  | 117 ++++++++++++++----
 python/air/backend/xrt_runner.py              |  17 ++-
 10 files changed, 222 insertions(+), 232 deletions(-)
 delete mode 100644 programming_examples/data_transfer_transpose/channel/run.py
 delete mode 100644 programming_examples/data_transfer_transpose/common.py
 delete mode 100644 programming_examples/data_transfer_transpose/dma/run.py

diff --git a/programming_examples/data_transfer_transpose/channel/Makefile b/programming_examples/data_transfer_transpose/channel/Makefile
index d44f5e687..f488b8776 100644
--- a/programming_examples/data_transfer_transpose/channel/Makefile
+++ b/programming_examples/data_transfer_transpose/channel/Makefile
@@ -7,9 +7,18 @@ targetname := $(shell basename ${srcdir})
 M ?= 64
 K ?= 32
 
-run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py -m ${M} -k ${K}
+all: run_int run_float
+
+print:
+	${powershell} python3 ${srcdir}/transpose.py -p
+
+run_int:
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/transpose.py -m ${M} -k ${K} -t uint32
+
+run_float:
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/transpose.py -m ${M} -k ${K} -t float32
 
 clean:
-	rm -rf build __pycache__
\ No newline at end of file
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/data_transfer_transpose/channel/run.py b/programming_examples/data_transfer_transpose/channel/run.py
deleted file mode 100644
index b88e25779..000000000
--- a/programming_examples/data_transfer_transpose/channel/run.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from channel.transpose import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-m",
-        type=int,
-        default=64,
-        help="The matrix to transpose will be of size M x K, this parameter sets the M value",
-    )
-    parser.add_argument(
-        "-k",
-        type=int,
-        default=32,
-        help="The matrix to transpose will be of size M x K, this parameter sets the k value",
-    )
-    args = parser.parse_args()
-    test_main(
-        build_module, m=args.m, k=args.k, experimental_passes=True, verbose=args.verbose
-    )
diff --git a/programming_examples/data_transfer_transpose/channel/run_makefile.lit b/programming_examples/data_transfer_transpose/channel/run_makefile.lit
index fe881ef0f..be8e64ba6 100644
--- a/programming_examples/data_transfer_transpose/channel/run_makefile.lit
+++ b/programming_examples/data_transfer_transpose/channel/run_makefile.lit
@@ -4,5 +4,6 @@
  // REQUIRES: ryzen_ai
  //
  // RUN: make -f %S/Makefile clean
- // RUN: make -f %S/Makefile run | FileCheck %s
+ // RUN: make -f %S/Makefile run_int | FileCheck %s
+ // RUN: make -f %S/Makefile run_float | FileCheck %s
  // CHECK: PASS!
diff --git a/programming_examples/data_transfer_transpose/channel/transpose.py b/programming_examples/data_transfer_transpose/channel/transpose.py
index f6a924ac5..f1fe6cf66 100644
--- a/programming_examples/data_transfer_transpose/channel/transpose.py
+++ b/programming_examples/data_transfer_transpose/channel/transpose.py
@@ -1,32 +1,27 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
-
-from common import *
+dtype_map = {
+    "uint32": np.uint32,
+    "float32": np.float32,
+}
+DEFAULT_DTYPE = "uint32"
 
 
 @module_builder
-def build_module(m, k):
-    memrefTyIn = MemRefType.get(shape=[m, k], element_type=T.i32())
-    memrefTyOut = MemRefType.get(shape=[k, m], element_type=T.i32())
+def build_module(m, k, dtype):
+    xrt_dtype = type_mapper(dtype)
+
+    memrefTyIn = MemRefType.get(shape=[m, k], element_type=xrt_dtype)
+    memrefTyOut = MemRefType.get(shape=[k, m], element_type=xrt_dtype)
 
     ChannelOp("ChanIn")
     ChannelOp("ChanOut")
@@ -54,7 +49,7 @@ def herd_body(_tx, _ty, _sx, _sy):
                     # This is the type definition of the tensor
                     tensor_type = MemRefType.get(
                         shape=[k * m],  # Read as one large array
-                        element_type=T.i32(),
+                        element_type=xrt_dtype,
                         memory_space=mem_space,
                     )
 
@@ -68,5 +63,75 @@ def herd_body(_tx, _ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="transpose.py",
+        description="Builds, runs, and tests the data_transfer_transpose/channel example",
+    )
+
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-m",
+        type=int,
+        default=64,
+        help="The matrix to transpose will be of size M x K, this parameter sets the M value",
+    )
+    parser.add_argument(
+        "-k",
+        type=int,
+        default=32,
+        help="The matrix to transpose will be of size M x K, this parameter sets the k value",
+    )
+    parser.add_argument(
+        "-t",
+        "--dtype",
+        default=DEFAULT_DTYPE,
+        choices=dtype_map.keys(),
+        help="The data type of the matrix",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    np_dtype = dtype_map[args.dtype]
+    mlir_module = build_module(args.m, args.k, np_dtype)
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    # Generate a random matrix (floats are drawn as same-width integers, then cast)
+    matrix_shape = (args.m, args.k)
+    if np.issubdtype(np_dtype, np.floating):
+        for np_type in dtype_map.values():
+            if not np.issubdtype(np_type, np.floating):
+                if np.dtype(np_type).itemsize == np.dtype(np_dtype).itemsize:
+                    int_type_substitution = np_type
+        input_matrix = np.random.randint(
+            low=np.iinfo(int_type_substitution).min,
+            high=np.iinfo(int_type_substitution).max,
+            size=matrix_shape,
+            dtype=int_type_substitution,
+        ).astype(np_dtype)
+    else:
+        input_matrix = np.random.randint(
+            low=np.iinfo(np_dtype).min,
+            high=np.iinfo(np_dtype).max,
+            size=matrix_shape,
+            dtype=np_dtype,
+        )
+    expected_output_matrix = np.transpose(input_matrix)
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module,
+            inputs=[input_matrix],
+            expected_outputs=[expected_output_matrix],
+        )
+    )
diff --git a/programming_examples/data_transfer_transpose/common.py b/programming_examples/data_transfer_transpose/common.py
deleted file mode 100644
index 179ccb1f5..000000000
--- a/programming_examples/data_transfer_transpose/common.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: MIT
-
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-# TODO: check with different data types
-# INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-# INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def test_main(build_module, m, k, verbose=False, experimental_passes=False):
-    mlir_module = build_module(m, k)
-
-    matrix_shape = (m, k)
-    matrix_shape_t = (k, m)
-    # TODO: configure with different data types
-    matrix_dtype = np.uint32
-
-    # Generate a random matrix
-    input_matrix = np.random.randint(
-        low=0, high=2**32 - 1, size=matrix_shape, dtype=matrix_dtype
-    )
-    expected_output_matrix = np.transpose(input_matrix)
-    actual_output_matrix = np.zeros(matrix_shape_t, dtype=matrix_dtype)
-    assert expected_output_matrix.shape == actual_output_matrix.shape
-
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose,
-        omit_while_true_loop=True,
-        experimental_passes=experimental_passes,
-    )
-
-    if verbose:
-        print(input_matrix)
-
-    # Run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        transpose = backend.compile_and_load(mlir_module)
-        (_, actual_output_matrix) = transpose(input_matrix, actual_output_matrix)
-    backend.unload()
-
-    actual_output_matrix = actual_output_matrix.reshape(matrix_shape_t)
-    assert expected_output_matrix.shape == actual_output_matrix.shape
-
-    if verbose:
-        print("======== ORIGINAL ========")
-        print(input_matrix)
-        print("======== EXPECTED ========")
-        print(expected_output_matrix)
-        print("======== ACTUAL ==========")
-        print(actual_output_matrix)
-
-    # check output, should have all values incremented
-    errors = 0
-    for m_index in range(m):
-        for k_index in range(k):
-            expected_value = expected_output_matrix.item((k_index, m_index))
-            actual_value = actual_output_matrix.item((k_index, m_index))
-
-            if not (actual_value == expected_value):
-                """
-                print(
-                    f"IM {i} [{m_index}, {k_index}] should be 0x{expected_value:x}, is 0x{actual_value:x}\n"
-                )
-                """
-                errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
diff --git a/programming_examples/data_transfer_transpose/dma/Makefile b/programming_examples/data_transfer_transpose/dma/Makefile
index d44f5e687..f488b8776 100644
--- a/programming_examples/data_transfer_transpose/dma/Makefile
+++ b/programming_examples/data_transfer_transpose/dma/Makefile
@@ -7,9 +7,18 @@ targetname := $(shell basename ${srcdir})
 M ?= 64
 K ?= 32
 
-run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py -m ${M} -k ${K}
+all: run_int run_float
+
+print:
+	${powershell} python3 ${srcdir}/transpose.py -p
+
+run_int:
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/transpose.py -m ${M} -k ${K} -t uint32
+
+run_float:
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/transpose.py -m ${M} -k ${K} -t float32
 
 clean:
-	rm -rf build __pycache__
\ No newline at end of file
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/data_transfer_transpose/dma/run.py b/programming_examples/data_transfer_transpose/dma/run.py
deleted file mode 100644
index 60c52b51f..000000000
--- a/programming_examples/data_transfer_transpose/dma/run.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from dma.transpose import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-m",
-        type=int,
-        default=64,
-        help="The matrix to transpose will be of size M x K, this parameter sets the M value",
-    )
-    parser.add_argument(
-        "-k",
-        type=int,
-        default=32,
-        help="The matrix to transpose will be of size M x K, this parameter sets the k value",
-    )
-    args = parser.parse_args()
-    test_main(
-        build_module, m=args.m, k=args.k, verbose=args.verbose, experimental_passes=True
-    )
diff --git a/programming_examples/data_transfer_transpose/dma/run_makefile.lit b/programming_examples/data_transfer_transpose/dma/run_makefile.lit
index fe881ef0f..be8e64ba6 100644
--- a/programming_examples/data_transfer_transpose/dma/run_makefile.lit
+++ b/programming_examples/data_transfer_transpose/dma/run_makefile.lit
@@ -4,5 +4,6 @@
  // REQUIRES: ryzen_ai
  //
  // RUN: make -f %S/Makefile clean
- // RUN: make -f %S/Makefile run | FileCheck %s
+ // RUN: make -f %S/Makefile run_int | FileCheck %s
+ // RUN: make -f %S/Makefile run_float | FileCheck %s
  // CHECK: PASS!
diff --git a/programming_examples/data_transfer_transpose/dma/transpose.py b/programming_examples/data_transfer_transpose/dma/transpose.py
index 9822f41ed..f849e3a88 100644
--- a/programming_examples/data_transfer_transpose/dma/transpose.py
+++ b/programming_examples/data_transfer_transpose/dma/transpose.py
@@ -1,57 +1,47 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
-
-from common import *
+dtype_map = {
+    "uint32": np.uint32,
+    "float32": np.float32,
+}
+DEFAULT_DTYPE = "uint32"
 
 
 @module_builder
-def build_module(m, k):
-    memrefTyIn = MemRefType.get(shape=[m, k], element_type=T.i32())
-    memrefTyOut = MemRefType.get(shape=[k, m], element_type=T.i32())
+def build_module(m, k, dtype):
+    xrt_dtype = type_mapper(dtype)
+
+    memrefTyIn = MemRefType.get(shape=[m, k], element_type=xrt_dtype)
+    memrefTyOut = MemRefType.get(shape=[k, m], element_type=xrt_dtype)
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyIn, memrefTyOut)
     def transpose(arg0, arg1):
 
-        # The arguments are the input and output
         @launch(operands=[arg0, arg1])
         def launch_body(a, b):
 
-            # The arguments are still the input and the output
             @segment(name="seg", operands=[a, b])
             def segment_body(arg2, arg3):
 
-                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
-                # We just need one compute core, so we ask for a 1x1 herd
                 @herd(name="herd", sizes=[1, 1], operands=[arg2, arg3])
                 def herd_body(_tx, _ty, _sx, _sy, a, b):
-
                     # We want to store our data in L1 memory
                     mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
 
                     # This is the type definition of the tensor
                     tensor_type = MemRefType.get(
                         shape=[m * k],  # Read as one large array
-                        element_type=T.i32(),
+                        element_type=xrt_dtype,
                         memory_space=mem_space,
                     )
 
@@ -73,7 +63,82 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
                     # Deallocate our L1 buffer
                     DeallocOp(tensor_in)
 
+                    # We must allocate a buffer of tile size for the input/output
+                    tensor_in = AllocOp(tensor_type, [], [])
+
+                    DeallocOp(tensor_in)
+
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="transpose.py",
+        description="Builds, runs, and tests the data_transfer_transpose/dma example",
+    )
+
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-m",
+        type=int,
+        default=64,
+        help="The matrix to transpose will be of size M x K, this parameter sets the M value",
+    )
+    parser.add_argument(
+        "-k",
+        type=int,
+        default=32,
+        help="The matrix to transpose will be of size M x K, this parameter sets the k value",
+    )
+    parser.add_argument(
+        "-t",
+        "--dtype",
+        default=DEFAULT_DTYPE,
+        choices=dtype_map.keys(),
+        help="The data type of the matrix",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    np_dtype = dtype_map[args.dtype]
+    mlir_module = build_module(args.m, args.k, np_dtype)
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    # Generate a random matrix (floats are drawn as same-width integers, then cast)
+    matrix_shape = (args.m, args.k)
+    if np.issubdtype(np_dtype, np.floating):
+        for np_type in dtype_map.values():
+            if not np.issubdtype(np_type, np.floating):
+                if np.dtype(np_type).itemsize == np.dtype(np_dtype).itemsize:
+                    int_type_substitution = np_type
+        input_matrix = np.random.randint(
+            low=np.iinfo(int_type_substitution).min,
+            high=np.iinfo(int_type_substitution).max,
+            size=matrix_shape,
+            dtype=int_type_substitution,
+        ).astype(np_dtype)
+    else:
+        input_matrix = np.random.randint(
+            low=np.iinfo(np_dtype).min,
+            high=np.iinfo(np_dtype).max,
+            size=matrix_shape,
+            dtype=np_dtype,
+        )
+    expected_output_matrix = np.transpose(input_matrix)
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module,
+            inputs=[input_matrix],
+            expected_outputs=[expected_output_matrix],
+        )
+    )
diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index e4c88004a..0ca91573e 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -13,9 +13,22 @@
 TYPE_MAP_DICT = defaultdict(
     lambda: None,
     {
+        # Integer types
+        np.int8: T.i8,
+        np.int16: T.i16,
+        np.int32: T.i32,
+        np.int64: T.i64,
+        
+        # Unsigned Integer Types
         np.uint8: T.ui8,
-        np.uint32: T.i32,
-        # TODO: add more mappings here
+        np.uint16: T.ui16,
+        np.uint32: T.ui32,
+        np.uint64: T.ui64,
+
+        # Floating point types
+        np.float16: T.f16,
+        np.float32: T.f32,
+        np.float64: T.f64,
     },
 )
 

From 8cdef3052c20475acf49cf6ea57a0be9d1ef17e6 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 13:49:51 -0600
Subject: [PATCH 22/31] add mapping for bfloat16

---
 python/air/backend/xrt_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index 0ca91573e..65f4a949d 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -9,6 +9,7 @@
 import filelock
 from typing import List
 from collections import defaultdict
+from bfloat16 import bfloat16
 
 TYPE_MAP_DICT = defaultdict(
     lambda: None,
@@ -18,17 +19,16 @@
         np.int16: T.i16,
         np.int32: T.i32,
         np.int64: T.i64,
-        
         # Unsigned Integer Types
         np.uint8: T.ui8,
         np.uint16: T.ui16,
         np.uint32: T.ui32,
         np.uint64: T.ui64,
-
         # Floating point types
         np.float16: T.f16,
         np.float32: T.f32,
         np.float64: T.f64,
+        bfloat16: T.bf16,
     },
 )
 

From bb8d107d3f6d1c377e68a866953c18fbab323094 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 14:08:51 -0600
Subject: [PATCH 23/31] Clean up makefiles

---
 .../multi_segment_channel/Makefile            | 13 ++++--
 .../multi_segment/multi_segment_dma/Makefile  | 13 ++++--
 .../passthrough/passthrough_kernel/Makefile   |  2 +-
 programming_examples/segment_alloc/Makefile   | 11 +++--
 programming_examples/shim_dma_2d/Makefile     | 40 +++++++++----------
 5 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/programming_examples/multi_segment/multi_segment_channel/Makefile b/programming_examples/multi_segment/multi_segment_channel/Makefile
index 374d54e6b..f1a1f1f9b 100644
--- a/programming_examples/multi_segment/multi_segment_channel/Makefile
+++ b/programming_examples/multi_segment/multi_segment_channel/Makefile
@@ -1,12 +1,17 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/multi_segment.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/multi_segment.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/multi_segment.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/multi_segment/multi_segment_dma/Makefile b/programming_examples/multi_segment/multi_segment_dma/Makefile
index 374d54e6b..f1a1f1f9b 100644
--- a/programming_examples/multi_segment/multi_segment_dma/Makefile
+++ b/programming_examples/multi_segment/multi_segment_dma/Makefile
@@ -1,12 +1,17 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/multi_segment.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/multi_segment.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/multi_segment.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/passthrough/passthrough_kernel/Makefile b/programming_examples/passthrough/passthrough_kernel/Makefile
index bd5df8d8e..4b94c23e7 100644
--- a/programming_examples/passthrough/passthrough_kernel/Makefile
+++ b/programming_examples/passthrough/passthrough_kernel/Makefile
@@ -20,7 +20,7 @@ VPATH := ${MLIR_AIE_DIR}/aie_kernels/generic
 all: run
 
 print:
-	${powershell} python3 ${srcdir}/passthrough_dma.py -p
+	${powershell} python3 ${srcdir}/passthrough_kernel.py -p
 
 ${srcdir}/build/passThrough.cc.o: ${VPATH}/passThrough.cc
 	mkdir -p ${srcdir}/build
diff --git a/programming_examples/segment_alloc/Makefile b/programming_examples/segment_alloc/Makefile
index 2b3464752..ea855ed47 100644
--- a/programming_examples/segment_alloc/Makefile
+++ b/programming_examples/segment_alloc/Makefile
@@ -4,9 +4,14 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/segment_alloc.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/segment_alloc.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/segment_alloc.py
 
 clean:
-	rm -rf build __pycache__
\ No newline at end of file
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/shim_dma_2d/Makefile b/programming_examples/shim_dma_2d/Makefile
index 9e7b68a9c..d8fef893c 100644
--- a/programming_examples/shim_dma_2d/Makefile
+++ b/programming_examples/shim_dma_2d/Makefile
@@ -3,42 +3,42 @@
 
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
-all: build/final.xclbin
+all: ${srcdir}/build/final.xclbin
 
 targetname := $(shell basename ${srcdir})
 
-build/air.mlir: ${srcdir}/${targetname}.py
+${srcdir}/build/air.mlir: ${srcdir}/${targetname}.py
 	mkdir -p ${@D}
 	python3 $< > $@
 
-build/final.xclbin: build/air.mlir
+${srcdir}/build/final.xclbin: ${srcdir}/build/air.mlir
 	mkdir -p ${@D}
-	cd build && aircc.py -xbridge -o ${@F} --tmpdir tmp --device npu1_1col --host-target x86_64 --experimental-passes $(<:%=../%)
+	cd ${srcdir}/build && aircc.py -xbridge -o ${@F} --tmpdir tmp --device npu1_1col --host-target x86_64 --experimental-passes ${srcdir}/build/air.mlir
 
-build/final.py.xclbin: build/air.mlir
+${srcdir}/build/final.py.xclbin: ${srcdir}/build/air.mlir
 	mkdir -p ${@D}
-	cd build && python3 build.py $(<:%=../%)
+	cd ${srcdir}/build && python3 build.py ${srcdir}/build/air.mlir
 
-${targetname}.exe: ${srcdir}/test.cpp
-	rm -rf _build
-	mkdir -p _build
-	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} 
-	cd _build && ${powershell} cmake --build . --config Release
+${srcdir}/${targetname}.exe: ${srcdir}/test.cpp
+	rm -rf ${srcdir}/_build
+	mkdir -p ${srcdir}/_build
+	cd ${srcdir}/_build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} 
+	cd ${srcdir}/_build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
-	cp _build/${targetname}.exe $@
+	cp ${srcdir}/_build/${targetname}.exe $@
 else
-	cp _build/${targetname} $@ 
+	cp ${srcdir}/_build/${targetname} $@ 
 endif
 
-run: ${targetname}.exe build/final.xclbin build/final.insts.txt 
-	${powershell} ./$< -x build/final.xclbin -i build/final.insts.txt -k MLIR_AIE
+run: ${srcdir}/${targetname}.exe ${srcdir}/build/final.xclbin ${srcdir}/build/final.insts.txt 
+	${powershell} $< -x build/final.xclbin -i build/final.insts.txt -k MLIR_AIE
 
-run_py: build/final.xclbin build/final.insts.txt 
-	${powershell} python3 ${srcdir}/test.py build/final.xclbin build/final.insts.txt 
+run_py: ${srcdir}/build/final.xclbin ${srcdir}/build/final.insts.txt 
+	${powershell} python3 ${srcdir}/test.py ${srcdir}/build/final.xclbin ${srcdir}/build/final.insts.txt 
 
 pyworkflow:
-	mkdir -p pybuild
-	cd pybuild && ${powershell} python3 ${srcdir}/run.py 
+	mkdir -p ${srcdir}/pybuild
+	cd ${srcdir}/pybuild && ${powershell} python3 ${srcdir}/run.py 
 
 clean:
-	rm -rf build _build pybuild tmp ${targetname}.exe
+	rm -rf ${srcdir}/build ${srcdir}/_build ${srcdir}/pybuild tmp ${targetname}.exe

From 8e3c4a714ed2c3e6288669a7ff7dee0ae013f5a1 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 14:17:25 -0600
Subject: [PATCH 24/31] Fix some bugs

---
 .../multi_core_channel/multi_core_channel.py  |  2 +-
 .../multi_core_dma/multi_core_dma.py          |  2 +-
 .../multi_launch_channel.py                   |  2 +-
 .../multi_launch_channel/run.py               | 35 -------------------
 .../single_core_channel.py                    |  2 +-
 .../single_core_dma/single_core_dma.py        |  2 +-
 programming_examples/shim_dma_2d/Makefile     |  2 +-
 7 files changed, 6 insertions(+), 41 deletions(-)
 delete mode 100644 programming_examples/matrix_scalar_add/multi_launch_channel/run.py

diff --git a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
index 0babdaa25..02a3f0157 100644
--- a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
@@ -131,7 +131,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     IMAGE_HEIGHT = 32
     TILE_WIDTH = 8
     TILE_HEIGHT = 16
-    INOUT_DATATYPE = np.uint32
+    INOUT_DATATYPE = np.int32
 
     parser = argparse.ArgumentParser(
         prog="run.py",
diff --git a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
index 9cfb01a55..9bb21c1e2 100644
--- a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
+++ b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
@@ -149,7 +149,7 @@ def herd_body(tx, ty, _sx, _sy, a, b):
     IMAGE_HEIGHT = 32
     TILE_WIDTH = 8
     TILE_HEIGHT = 16
-    INOUT_DATATYPE = np.uint32
+    INOUT_DATATYPE = np.int32
 
     parser = argparse.ArgumentParser(
         prog="run.py",
diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
index c2c241234..17249ecf7 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
@@ -122,7 +122,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     IMAGE_HEIGHT = 32
     TILE_WIDTH = 8
     TILE_HEIGHT = 16
-    INOUT_DATATYPE = np.uint32
+    INOUT_DATATYPE = np.int32
 
     parser = argparse.ArgumentParser(
         prog="run.py",
diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/run.py b/programming_examples/matrix_scalar_add/multi_launch_channel/run.py
deleted file mode 100644
index 8c869d4d3..000000000
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/run.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from multi_launch_channel.multi_launch_channel import build_module
-from common import test_main
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/multi_launch_channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, experimental_passes=False, verbose=args.verbose)
diff --git a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
index 291a824d4..b48d41f82 100644
--- a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
+++ b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
@@ -132,7 +132,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     IMAGE_HEIGHT = 32
     TILE_WIDTH = 8
     TILE_HEIGHT = 16
-    INOUT_DATATYPE = np.uint32
+    INOUT_DATATYPE = np.int32
 
     parser = argparse.ArgumentParser(
         prog="run.py",
diff --git a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
index 5d33cb514..496e7d114 100644
--- a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
+++ b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
@@ -119,7 +119,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
     IMAGE_HEIGHT = 32
     TILE_WIDTH = 8
     TILE_HEIGHT = 16
-    INOUT_DATATYPE = np.uint32
+    INOUT_DATATYPE = np.int32
 
     parser = argparse.ArgumentParser(
         prog="run.py",
diff --git a/programming_examples/shim_dma_2d/Makefile b/programming_examples/shim_dma_2d/Makefile
index d8fef893c..fa8c17b2d 100644
--- a/programming_examples/shim_dma_2d/Makefile
+++ b/programming_examples/shim_dma_2d/Makefile
@@ -41,4 +41,4 @@ pyworkflow:
 	cd ${srcdir}/pybuild && ${powershell} python3 ${srcdir}/run.py 
 
 clean:
-	rm -rf ${srcdir}/build ${srcdir}/_build ${srcdir}/pybuild tmp ${targetname}.exe
+	rm -rf ${srcdir}/build ${srcdir}/_build ${srcdir}/pybuild tmp ${srcdir}/${targetname}.exe

From ee31b76a00095a9f7789f65cdcab0a112644101a Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 14:40:42 -0600
Subject: [PATCH 25/31] Clean up channel herd_to_herd examples

---
 .../herd_to_herd/multi_segment/Makefile       | 13 ++-
 .../multi_segment/herd_to_herd.py             | 43 +++++++--
 .../herd_to_herd/multi_segment/run.py         | 91 -------------------
 .../herd_to_herd/single_segment/Makefile      | 13 ++-
 .../single_segment/herd_to_herd.py            | 39 +++++++-
 .../herd_to_herd/single_segment/run.py        | 89 ------------------
 6 files changed, 88 insertions(+), 200 deletions(-)
 delete mode 100644 programming_examples/channel_examples/herd_to_herd/multi_segment/run.py
 delete mode 100644 programming_examples/channel_examples/herd_to_herd/single_segment/run.py

diff --git a/programming_examples/channel_examples/herd_to_herd/multi_segment/Makefile b/programming_examples/channel_examples/herd_to_herd/multi_segment/Makefile
index 844c5686d..0a9e5a8ad 100644
--- a/programming_examples/channel_examples/herd_to_herd/multi_segment/Makefile
+++ b/programming_examples/channel_examples/herd_to_herd/multi_segment/Makefile
@@ -1,12 +1,17 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/herd_to_herd.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py -v
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/herd_to_herd.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
index fd1726a9b..fd1d0ee57 100644
--- a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
+++ b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
@@ -1,17 +1,22 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
 IMAGE_WIDTH = 32
 IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
+
+INOUT_DATATYPE = np.uint32
 
 
 @module_builder
@@ -63,8 +68,8 @@ def herd_body(tx, ty, sx, sy):
                     ChannelGet("ChanIn", image_in)
 
                     # Access every value in the image
-                    for j in range_(IMAGE_HEIGHT):
-                        for i in range_(IMAGE_WIDTH):
+                    for i in range_(IMAGE_HEIGHT):
+                        for j in range_(IMAGE_WIDTH):
                             # Load the input value
                             val_in = load(image_in, [i, j])
 
@@ -94,8 +99,8 @@ def herd_body(tx, ty, sx, sy):
                     ChannelGet("Herd2Herd", image_in)
 
                     # Access every value in the image
-                    for j in range_(IMAGE_HEIGHT):
-                        for i in range_(IMAGE_WIDTH):
+                    for i in range_(IMAGE_HEIGHT):
+                        for j in range_(IMAGE_WIDTH):
                             # Load the input value
                             val_in = load(image_in, [i, j])
 
@@ -114,5 +119,29 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the herd_to_herd channel example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.full(IMAGE_SIZE, 0x2, dtype=INOUT_DATATYPE)
+    output_b = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE)
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/channel_examples/herd_to_herd/multi_segment/run.py b/programming_examples/channel_examples/herd_to_herd/multi_segment/run.py
deleted file mode 100644
index 03b84d1dc..000000000
--- a/programming_examples/channel_examples/herd_to_herd/multi_segment/run.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-from herd_to_herd import *
-
-INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def print_matrix(matrix_array):
-    for i in range(IMAGE_HEIGHT):
-        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
-        for val in row:
-            val = val & 0xFFFF
-            print(f"{val:04x}", end=" ")
-        print("")
-
-
-def test_main(build_module, verbose=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = 0x2
-        input_b[i] = 0x00C0FFEE
-
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
-    )
-
-    if verbose:
-        print_matrix(input_b)
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        addone = backend.compile_and_load(mlir_module)
-        (_, output_b) = addone(input_a, input_b)
-
-    backend.unload()
-
-    if verbose:
-        print_matrix(output_b)
-
-    # check output, should have all values incremented
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-
-        row = i // IMAGE_WIDTH
-        col = i % IMAGE_WIDTH
-
-        # value should have been updated
-        expected_value = 0x2 * 0x2 + 1
-        if not (rb == expected_value):
-            """
-            print(
-                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
-            )
-            """
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the herd-to-herd multi-segment example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/channel_examples/herd_to_herd/single_segment/Makefile b/programming_examples/channel_examples/herd_to_herd/single_segment/Makefile
index 77dc865ad..0a9e5a8ad 100644
--- a/programming_examples/channel_examples/herd_to_herd/single_segment/Makefile
+++ b/programming_examples/channel_examples/herd_to_herd/single_segment/Makefile
@@ -1,12 +1,17 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/herd_to_herd.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/herd_to_herd.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
index 770023345..a21f4792c 100644
--- a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
+++ b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
@@ -1,5 +1,7 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
@@ -8,12 +10,15 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
 IMAGE_WIDTH = 32
 IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
+
+INOUT_DATATYPE = np.uint32
 
 
 @module_builder
@@ -87,8 +92,8 @@ def herd_body(tx, ty, sx, sy):
                     ChannelGet("Herd2Herd", image_in)
 
                     # Access every value in the image
-                    for j in range_(IMAGE_HEIGHT):
-                        for i in range_(IMAGE_WIDTH):
+                    for i in range_(IMAGE_HEIGHT):
+                        for j in range_(IMAGE_WIDTH):
                             # Load the input value
                             val_in = load(image_in, [i, j])
 
@@ -107,5 +112,29 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the herd_to_herd channel example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_a = np.full(IMAGE_SIZE, 0x2, dtype=INOUT_DATATYPE)
+    output_b = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE)
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=False)
+    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/channel_examples/herd_to_herd/single_segment/run.py b/programming_examples/channel_examples/herd_to_herd/single_segment/run.py
deleted file mode 100644
index df5468a0f..000000000
--- a/programming_examples/channel_examples/herd_to_herd/single_segment/run.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-from herd_to_herd import *
-
-INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def print_matrix(matrix_array):
-    for i in range(IMAGE_HEIGHT):
-        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
-        for val in row:
-            val = val & 0xFFFF
-            print(f"{val:04x}", end=" ")
-        print("")
-
-
-def test_main(build_module, verbose=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = 0x2
-        input_b[i] = 0x00C0FFEE
-
-    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
-
-    if verbose:
-        print_matrix(input_b)
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        addone = backend.compile_and_load(mlir_module)
-        (_, output_b) = addone(input_a, input_b)
-
-    backend.unload()
-
-    if verbose:
-        print_matrix(output_b)
-
-    # check output, should have all values incremented
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-
-        row = i // IMAGE_WIDTH
-        col = i % IMAGE_WIDTH
-
-        # value should have been updated
-        expected_value = 0x2 * 0x2 + 1
-        if not (rb == expected_value):
-            """
-            print(
-                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
-            )
-            """
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the herd-to-herd multi-segment example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, verbose=args.verbose)

From 9af812aa3868e7eb2d7dd11786e2017ba75b9d32 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 14:43:39 -0600
Subject: [PATCH 26/31] Revert bad Makefile changes in shim_dma_2d

---
 programming_examples/shim_dma_2d/Makefile | 40 +++++++++++------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/programming_examples/shim_dma_2d/Makefile b/programming_examples/shim_dma_2d/Makefile
index fa8c17b2d..c5a5d4c46 100644
--- a/programming_examples/shim_dma_2d/Makefile
+++ b/programming_examples/shim_dma_2d/Makefile
@@ -3,42 +3,42 @@
 
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
-all: ${srcdir}/build/final.xclbin
+all: build/final.xclbin
 
 targetname := $(shell basename ${srcdir})
 
-${srcdir}/build/air.mlir: ${srcdir}/${targetname}.py
+build/air.mlir: ${srcdir}/${targetname}.py
 	mkdir -p ${@D}
 	python3 $< > $@
 
-${srcdir}/build/final.xclbin: ${srcdir}/build/air.mlir
+build/final.xclbin: build/air.mlir
 	mkdir -p ${@D}
-	cd ${srcdir}/build && aircc.py -xbridge -o ${@F} --tmpdir tmp --device npu1_1col --host-target x86_64 --experimental-passes ${srcdir}/build/air.mlir
+	cd build && aircc.py -xbridge -o ${@F} --tmpdir tmp --device npu1_1col --host-target x86_64 --experimental-passes $(<:%=../%)
 
-${srcdir}/build/final.py.xclbin: ${srcdir}/build/air.mlir
+build/final.py.xclbin: build/air.mlir
 	mkdir -p ${@D}
-	cd ${srcdir}/build && python3 build.py ${srcdir}/build/air.mlir
+	cd build && python3 build.py $(<:%=../%)
 
-${srcdir}/${targetname}.exe: ${srcdir}/test.cpp
-	rm -rf ${srcdir}/_build
-	mkdir -p ${srcdir}/_build
-	cd ${srcdir}/_build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} 
-	cd ${srcdir}/_build && ${powershell} cmake --build . --config Release
+${targetname}.exe: ${srcdir}/test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} 
+	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
-	cp ${srcdir}/_build/${targetname}.exe $@
+	cp _build/${targetname}.exe $@
 else
-	cp ${srcdir}/_build/${targetname} $@ 
+	cp _build/${targetname} $@ 
 endif
 
-run: ${srcdir}/${targetname}.exe ${srcdir}/build/final.xclbin ${srcdir}/build/final.insts.txt 
-	${powershell} $< -x build/final.xclbin -i build/final.insts.txt -k MLIR_AIE
+run: ${targetname}.exe build/final.xclbin build/final.insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/final.insts.txt -k MLIR_AIE
 
-run_py: ${srcdir}/build/final.xclbin ${srcdir}/build/final.insts.txt 
-	${powershell} python3 ${srcdir}/test.py ${srcdir}/build/final.xclbin ${srcdir}/build/final.insts.txt 
+run_py: build/final.xclbin build/final.insts.txt 
+	${powershell} python3 ${srcdir}/test.py build/final.xclbin build/final.insts.txt 
 
 pyworkflow:
-	mkdir -p ${srcdir}/pybuild
-	cd ${srcdir}/pybuild && ${powershell} python3 ${srcdir}/run.py 
+	mkdir -p pybuild
+	cd pybuild && ${powershell} python3 ${srcdir}/run.py 
 
 clean:
-	rm -rf ${srcdir}/build ${srcdir}/_build ${srcdir}/pybuild tmp ${srcdir}/${targetname}.exe
+	rm -rf build _build pybuild tmp ${targetname}.exe
\ No newline at end of file

From e3706bb79ba5e995f627e4c7ecfcd6bc7a80af72 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 15:12:44 -0600
Subject: [PATCH 27/31] Fix up channel size example

---
 .../channel_examples/channel_size/Makefile    | 13 ++-
 .../channel_size/channel_size.py              | 74 +++++++++++----
 .../channel_examples/channel_size/run.py      | 91 -------------------
 .../multi_segment_channel/multi_segment.py    |  2 +-
 .../multi_segment_dma/multi_segment.py        |  2 +-
 5 files changed, 67 insertions(+), 115 deletions(-)
 delete mode 100644 programming_examples/channel_examples/channel_size/run.py

diff --git a/programming_examples/channel_examples/channel_size/Makefile b/programming_examples/channel_examples/channel_size/Makefile
index 7f494aba2..91fcfa854 100644
--- a/programming_examples/channel_examples/channel_size/Makefile
+++ b/programming_examples/channel_examples/channel_size/Makefile
@@ -1,12 +1,17 @@
-# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/channel_size.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/channel_size.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py
index 3f64db5e1..29fe7fb61 100644
--- a/programming_examples/channel_examples/channel_size/channel_size.py
+++ b/programming_examples/channel_examples/channel_size/channel_size.py
@@ -1,24 +1,29 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner
 
 range_ = for_
 
 IMAGE_WIDTH = 32
 IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
 
 TILE_WIDTH = 16
 TILE_HEIGHT = 8
-TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]
+TILE_SIZE = [TILE_HEIGHT, TILE_WIDTH]
 
-assert IMAGE_WIDTH % TILE_WIDTH == 0
 assert IMAGE_HEIGHT % TILE_HEIGHT == 0
+assert IMAGE_WIDTH % TILE_WIDTH == 0
+
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
@@ -26,8 +31,8 @@ def build_module():
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
 
     # Create an input/output channel pair per worker
-    ChannelOp("ChanIn", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
-    ChannelOp("ChanOut", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
+    ChannelOp("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
+    ChannelOp("ChanOut", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
@@ -40,8 +45,8 @@ def launch_body(a, b):
             # Transfer one tile of data per worker
             for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
                 for w in range(IMAGE_WIDTH // TILE_WIDTH):
-                    offset0 = IMAGE_HEIGHT * h
-                    offset1 = IMAGE_HEIGHT * w
+                    offset0 = TILE_HEIGHT * h
+                    offset1 = TILE_WIDTH * w
 
                     # Put data into the channel tile by tile
                     ChannelPut(
@@ -49,15 +54,15 @@ def launch_body(a, b):
                         a,
                         indices=[w, h],
                         offsets=[offset0, offset1],
-                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        sizes=TILE_SIZE,
                         strides=[IMAGE_WIDTH, 1],
                     )
 
             # Transfer one tile of data per worker
             for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
                 for w in range(IMAGE_WIDTH // TILE_WIDTH):
-                    offset0 = IMAGE_HEIGHT * h
-                    offset1 = IMAGE_HEIGHT * w
+                    offset0 = TILE_HEIGHT * h
+                    offset1 = TILE_WIDTH * w
 
                     # Write data back out to the channel tile by tile
                     ChannelGet(
@@ -65,7 +70,7 @@ def launch_body(a, b):
                         b,
                         indices=[w, h],
                         offsets=[offset0, offset1],
-                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        sizes=TILE_SIZE,
                         strides=[IMAGE_WIDTH, 1],
                     )
 
@@ -75,7 +80,7 @@ def segment_body():
 
                 @herd(
                     name="xaddherd",
-                    sizes=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT],
+                    sizes=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH],
                 )
                 def herd_body(th, tw, _sx, _sy):
 
@@ -94,11 +99,11 @@ def herd_body(th, tw, _sx, _sy):
                     tile_out = AllocOp(tile_type, [], [])
 
                     # Copy a tile from the input image (a) into the L1 memory region (tile_in)
-                    ChannelGet("ChanIn", tile_in, indices=[tw, th])
+                    ChannelGet("ChanIn", tile_in, indices=[th, tw])
 
                     # Access every value in the tile
-                    for j in range_(TILE_HEIGHT):
-                        for i in range_(TILE_WIDTH):
+                    for i in range_(TILE_HEIGHT):
+                        for j in range_(TILE_WIDTH):
                             # Load the input value from tile_in
                             val = load(tile_in, [i, j])
 
@@ -108,7 +113,7 @@ def herd_body(th, tw, _sx, _sy):
                         yield_([])
 
                     # Copy the output tile into the output
-                    ChannelPut("ChanOut", tile_out, indices=[tw, th])
+                    ChannelPut("ChanOut", tile_out, indices=[th, tw])
 
                     # Deallocate our L1 buffers
                     DeallocOp(tile_in)
@@ -116,5 +121,38 @@ def herd_body(th, tw, _sx, _sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the channel_size example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_matrix = np.random.randint(
+        low=np.iinfo(INOUT_DATATYPE).min,
+        high=np.iinfo(INOUT_DATATYPE).max,
+        size=IMAGE_SIZE,
+        dtype=INOUT_DATATYPE,
+    )
+    output_matrix = input_matrix.copy()
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix]
+        )
+    )
diff --git a/programming_examples/channel_examples/channel_size/run.py b/programming_examples/channel_examples/channel_size/run.py
deleted file mode 100644
index 0a3bb2dd3..000000000
--- a/programming_examples/channel_examples/channel_size/run.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-from channel_size import *
-
-INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def print_matrix(matrix_array):
-    for i in range(IMAGE_HEIGHT):
-        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
-        for val in row:
-            val = val & 0xFFFF
-            print(f"{val:04x}", end=" ")
-        print("")
-
-
-def test_main(build_module, verbose=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = i + 0x1000
-        input_b[i] = 0x00DEFACED
-
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
-    )
-
-    if verbose:
-        print_matrix(input_b)
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        addone = backend.compile_and_load(mlir_module)
-        (_, output_b) = addone(input_a, input_b)
-
-    backend.unload()
-
-    if verbose:
-        print_matrix(output_b)
-
-    # check output, should have all values incremented
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-        expected_value = input_a[i]
-
-        row = i // IMAGE_WIDTH
-        col = i % IMAGE_WIDTH
-
-        # value should have been updated
-        if not (rb == expected_value):
-            """
-            print(
-                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
-            )
-            """
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel_examples/herd_to_herd example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
index f86574b51..eb7ae0775 100644
--- a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
@@ -99,7 +99,7 @@ def herd_body(tx, ty, sx, sy):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         prog="run.py",
-        description="Builds, runs, and tests the segment_alloc example",
+        description="Builds, runs, and tests the multi segment channel example",
     )
     parser.add_argument(
         "-v",
diff --git a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
index 939bbfc4f..27153ce47 100644
--- a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
@@ -90,7 +90,7 @@ def herd_body(tx, ty, sx, sy, b, d):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         prog="run.py",
-        description="Builds, runs, and tests the segment_alloc example",
+        description="Builds, runs, and tests the multi segment dma example",
     )
     parser.add_argument(
         "-v",

From 17c1347ebe825f14e11d51ee92d112ba640bb816 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 15:28:21 -0600
Subject: [PATCH 28/31] Fixup datatype mismatch between uint32 and int32 in
 programming examples

---
 .../channel_examples/channel_size/channel_size.py   |  7 ++++---
 .../herd_to_herd/multi_segment/herd_to_herd.py      | 11 ++++++-----
 .../herd_to_herd/single_segment/herd_to_herd.py     | 11 ++++++-----
 .../multi_segment_channel/multi_segment.py          | 13 +++++++------
 .../multi_segment_dma/multi_segment.py              | 13 +++++++------
 programming_examples/segment_alloc/segment_alloc.py | 11 ++++++-----
 programming_examples/shim_dma_2d/run.py             |  2 +-
 programming_examples/shim_dma_2d/test.py            |  2 +-
 8 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py
index 29fe7fb61..8aa113896 100644
--- a/programming_examples/channel_examples/channel_size/channel_size.py
+++ b/programming_examples/channel_examples/channel_size/channel_size.py
@@ -8,7 +8,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
@@ -28,7 +28,8 @@
 
 @module_builder
 def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
     # Create an input/output channel pair per worker
     ChannelOp("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
@@ -90,7 +91,7 @@ def herd_body(th, tw, _sx, _sy):
                     # This is the type definition of the tile
                     tile_type = MemRefType.get(
                         shape=TILE_SIZE,
-                        element_type=T.i32(),
+                        element_type=xrt_dtype,
                         memory_space=mem_space,
                     )
 
diff --git a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
index fd1d0ee57..9f9e6a4c8 100644
--- a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
+++ b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
@@ -8,7 +8,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
@@ -16,12 +16,13 @@
 IMAGE_HEIGHT = 16
 IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
 
-INOUT_DATATYPE = np.uint32
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
 def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
     # We want to store our data in L1 memory
     mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
@@ -29,7 +30,7 @@ def build_module():
     # This is the type definition of the tile
     image_type_l1 = MemRefType.get(
         shape=IMAGE_SIZE,
-        element_type=T.i32(),
+        element_type=xrt_dtype,
         memory_space=mem_space_l1,
     )
 
@@ -105,7 +106,7 @@ def herd_body(tx, ty, sx, sy):
                             val_in = load(image_in, [i, j])
 
                             # Calculate the output value
-                            val_out = arith.addi(val_in, arith.ConstantOp(T.i32(), 1))
+                            val_out = arith.addi(val_in, arith.ConstantOp(xrt_dtype, 1))
 
                             # Store the output value
                             store(val_out, image_out, [i, j])
diff --git a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
index a21f4792c..729802f4d 100644
--- a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
+++ b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
@@ -10,7 +10,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
@@ -18,12 +18,13 @@
 IMAGE_HEIGHT = 16
 IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
 
-INOUT_DATATYPE = np.uint32
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
 def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
     # We want to store our data in L1 memory
     mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
@@ -31,7 +32,7 @@ def build_module():
     # This is the type definition of the tile
     image_type_l1 = MemRefType.get(
         shape=IMAGE_SIZE,
-        element_type=T.i32(),
+        element_type=xrt_dtype,
         memory_space=mem_space_l1,
     )
 
@@ -98,7 +99,7 @@ def herd_body(tx, ty, sx, sy):
                             val_in = load(image_in, [i, j])
 
                             # Calculate the output value
-                            val_out = arith.addi(val_in, arith.ConstantOp(T.i32(), 1))
+                            val_out = arith.addi(val_in, arith.ConstantOp(xrt_dtype, 1))
 
                             # Store the output value
                             store(val_out, image_out, [i, j])
diff --git a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
index eb7ae0775..705cec06f 100644
--- a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
@@ -8,19 +8,20 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
 VECTOR_LEN = 32
 VECTOR_SIZE = [VECTOR_LEN, 1]
 
-INOUT_DATATYPE = np.uint32
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
 def build_module():
-    memrefTyInOut = MemRefType.get(VECTOR_SIZE, T.i32())
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(VECTOR_SIZE, xrt_dtype)
 
     # We want to store our data in L1 memory
     mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
@@ -28,7 +29,7 @@ def build_module():
     # This is the type definition of the tile
     image_type_l1 = MemRefType.get(
         shape=VECTOR_SIZE,
-        element_type=T.i32(),
+        element_type=xrt_dtype,
         memory_space=mem_space_l1,
     )
 
@@ -64,7 +65,7 @@ def herd_body(tx, ty, sx, sy):
                     c0 = arith.ConstantOp.create_index(0)
                     for j in range_(VECTOR_LEN):
                         val_a = load(image_in_a, [c0, j])
-                        val_outa = arith.addi(val_a, arith.constant(T.i32(), 10))
+                        val_outa = arith.addi(val_a, arith.constant(xrt_dtype, 10))
                         store(val_outa, image_out_a, [c0, j])
                         yield_([])
 
@@ -86,7 +87,7 @@ def herd_body(tx, ty, sx, sy):
                     c0 = arith.ConstantOp.create_index(0)
                     for j in range_(VECTOR_LEN):
                         val_b = load(image_in_b, [c0, j])
-                        val_outb = arith.addi(arith.constant(T.i32(), 10), val_b)
+                        val_outb = arith.addi(arith.constant(xrt_dtype, 10), val_b)
                         store(val_outb, image_out_b, [c0, j])
                         yield_([])
 
diff --git a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
index 27153ce47..dd459cab0 100644
--- a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
@@ -8,19 +8,20 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
 VECTOR_LEN = 32
 VECTOR_SIZE = [VECTOR_LEN, 1]
 
-INOUT_DATATYPE = np.uint32
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
 def build_module():
-    memrefTyInOut = MemRefType.get(VECTOR_SIZE, T.i32())
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(VECTOR_SIZE, xrt_dtype)
 
     # We want to store our data in L1 memory
     mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
@@ -28,7 +29,7 @@ def build_module():
     # This is the type definition of the tile
     image_type_l1 = MemRefType.get(
         shape=VECTOR_SIZE,
-        element_type=T.i32(),
+        element_type=xrt_dtype,
         memory_space=mem_space_l1,
     )
 
@@ -54,7 +55,7 @@ def herd_body(tx, ty, sx, sy, a, c):
                     c0 = arith.ConstantOp.create_index(0)
                     for j in range_(VECTOR_LEN):
                         val_a = load(image_in_a, [c0, j])
-                        val_outa = arith.addi(val_a, arith.constant(T.i32(), 10))
+                        val_outa = arith.addi(val_a, arith.constant(xrt_dtype, 10))
                         store(val_outa, image_out_a, [c0, j])
                         yield_([])
 
@@ -77,7 +78,7 @@ def herd_body(tx, ty, sx, sy, b, d):
                     c0 = arith.ConstantOp.create_index(0)
                     for j in range_(VECTOR_LEN):
                         val_b = load(image_in_b, [c0, j])
-                        val_outb = arith.addi(arith.constant(T.i32(), 10), val_b)
+                        val_outb = arith.addi(arith.constant(xrt_dtype, 10), val_b)
                         store(val_outb, image_out_b, [c0, j])
                         yield_([])
 
diff --git a/programming_examples/segment_alloc/segment_alloc.py b/programming_examples/segment_alloc/segment_alloc.py
index aa1a1e926..6addee6df 100644
--- a/programming_examples/segment_alloc/segment_alloc.py
+++ b/programming_examples/segment_alloc/segment_alloc.py
@@ -8,7 +8,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
@@ -23,12 +23,13 @@
 assert IMAGE_HEIGHT % TILE_HEIGHT == 0
 assert IMAGE_WIDTH % TILE_WIDTH == 0
 
-INOUT_DATATYPE = np.uint32
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
 def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
@@ -47,7 +48,7 @@ def segment_body(arg2, arg3):
                 # This is the type definition of the tile
                 tile_type_l2 = MemRefType.get(
                     shape=TILE_SIZE,
-                    element_type=T.i32(),
+                    element_type=xrt_dtype,
                     memory_space=mem_space_l2,
                 )
 
@@ -65,7 +66,7 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
                     # This is the type definition of the tile
                     tile_type_l1 = MemRefType.get(
                         shape=TILE_SIZE,
-                        element_type=T.i32(),
+                        element_type=xrt_dtype,
                         memory_space=mem_space_l1,
                     )
 
diff --git a/programming_examples/shim_dma_2d/run.py b/programming_examples/shim_dma_2d/run.py
index e26643829..b81cba1bd 100644
--- a/programming_examples/shim_dma_2d/run.py
+++ b/programming_examples/shim_dma_2d/run.py
@@ -7,7 +7,7 @@
 from air.backend.xrt_runner import XRTRunner
 from shim_dma_2d import *
 
-INOUT_DATATYPE = np.uint32
+INOUT_DATATYPE = np.int32
 VERBOSE = False
 
 
diff --git a/programming_examples/shim_dma_2d/test.py b/programming_examples/shim_dma_2d/test.py
index 531a79176..eff5de024 100644
--- a/programming_examples/shim_dma_2d/test.py
+++ b/programming_examples/shim_dma_2d/test.py
@@ -15,7 +15,7 @@
 
 KERNEL_NAME = "MLIR_AIE"
 
-INOUT_DATATYPE = np.uint32
+INOUT_DATATYPE = np.int32
 INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
 INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
 INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE

From a69df68ed1c71c4df775275558c418d3a2c0b957 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 15:37:36 -0600
Subject: [PATCH 29/31] Fix up hierarchical example

---
 .../channel_examples/hierarchical/Makefile    | 13 ++-
 .../hierarchical/hierarchical.py              | 56 ++++++++++--
 .../channel_examples/hierarchical/run.py      | 85 -------------------
 3 files changed, 56 insertions(+), 98 deletions(-)
 delete mode 100644 programming_examples/channel_examples/hierarchical/run.py

diff --git a/programming_examples/channel_examples/hierarchical/Makefile b/programming_examples/channel_examples/hierarchical/Makefile
index 77dc865ad..3ca296020 100644
--- a/programming_examples/channel_examples/hierarchical/Makefile
+++ b/programming_examples/channel_examples/hierarchical/Makefile
@@ -1,12 +1,17 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/hierarchical.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/hierarchical.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/channel_examples/hierarchical/hierarchical.py b/programming_examples/channel_examples/hierarchical/hierarchical.py
index 8ad41bdd4..f9969bb82 100644
--- a/programming_examples/channel_examples/hierarchical/hierarchical.py
+++ b/programming_examples/channel_examples/hierarchical/hierarchical.py
@@ -1,34 +1,40 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
 IMAGE_WIDTH = 32
 IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
+
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
 def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
     mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
     mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)
 
     image_type_l1 = MemRefType.get(
         shape=IMAGE_SIZE,
-        element_type=T.i32(),
+        element_type=xrt_dtype,
         memory_space=mem_space_l1,
     )
     image_type_l2 = MemRefType.get(
         shape=IMAGE_SIZE,
-        element_type=T.i32(),
+        element_type=xrt_dtype,
         memory_space=mem_space_l2,
     )
 
@@ -70,13 +76,13 @@ def herd_body(tx, ty, sx, sy):
                     ChannelGet("ChanInL1", image_in)
 
                     # Access every value in the image
-                    for j in range_(IMAGE_HEIGHT):
-                        for i in range_(IMAGE_WIDTH):
+                    for i in range_(IMAGE_HEIGHT):
+                        for j in range_(IMAGE_WIDTH):
                             # Load the input value
                             val_in = load(image_in, [i, j])
 
                             # Calculate the output value
-                            val_out = arith.addi(val_in, arith.ConstantOp(T.i32(), 1))
+                            val_out = arith.addi(val_in, arith.ConstantOp(xrt_dtype, 1))
 
                             # Store the output value
                             store(val_out, image_out, [i, j])
@@ -90,5 +96,37 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="hierarchical.py",
+        description="Builds, runs, and tests the channel hierarchical example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_matrix = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(
+        IMAGE_SIZE
+    )
+    output_matrix = np.arange(1, np.prod(IMAGE_SIZE) + 1, dtype=INOUT_DATATYPE).reshape(
+        IMAGE_SIZE
+    )
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix]
+        )
+    )
diff --git a/programming_examples/channel_examples/hierarchical/run.py b/programming_examples/channel_examples/hierarchical/run.py
deleted file mode 100644
index 335f022e7..000000000
--- a/programming_examples/channel_examples/hierarchical/run.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-from hierarchical import *
-
-INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def print_matrix(matrix_array):
-    for i in range(IMAGE_HEIGHT):
-        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
-        for val in row:
-            val = val & 0xFFFF
-            print(f"{val:04x}", end=" ")
-        print("")
-
-
-def test_main(build_module, verbose=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = i
-        input_b[i] = 0x00C0FFEE
-
-    backend = xrt_backend.XRTBackend(
-        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
-    )
-
-    if verbose:
-        print_matrix(input_b)
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        addone = backend.compile_and_load(mlir_module)
-        (_, output_b) = addone(input_a, input_b)
-
-    backend.unload()
-
-    if verbose:
-        print_matrix(output_b)
-
-    # check output, should have all values incremented
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-
-        # value should have been updated
-        expected_value = i + 1
-        if not (rb == expected_value):
-            if verbose:
-                print(f"IM {i} should be 0x{expected_value:x}, is 0x{rb:x}\n")
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel_examples/hierarchical example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, verbose=args.verbose)

From f5f921483996f7553d6b82640f2ad3d5266454c4 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Mon, 29 Jul 2024 17:07:10 -0600
Subject: [PATCH 30/31] clean up worker to worker

---
 .../channel_size/channel_size.py              |  6 +-
 .../channel_examples/worker_to_self/Makefile  | 11 ++-
 .../channel_examples/worker_to_self/run.py    | 88 ------------------
 .../worker_to_self/worker_to_self.py          | 52 +++++++++--
 .../worker_to_worker/Makefile                 | 11 ++-
 .../channel_examples/worker_to_worker/run.py  | 88 ------------------
 .../worker_to_worker/worker_to_worker.py      | 92 +++++++++++++------
 7 files changed, 127 insertions(+), 221 deletions(-)
 delete mode 100644 programming_examples/channel_examples/worker_to_self/run.py
 delete mode 100644 programming_examples/channel_examples/worker_to_worker/run.py

diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py
index 8aa113896..89d335acf 100644
--- a/programming_examples/channel_examples/channel_size/channel_size.py
+++ b/programming_examples/channel_examples/channel_size/channel_size.py
@@ -12,7 +12,7 @@
 
 range_ = for_
 
-IMAGE_WIDTH = 32
+IMAGE_WIDTH = 48
 IMAGE_HEIGHT = 16
 IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
 
@@ -53,7 +53,7 @@ def launch_body(a, b):
                     ChannelPut(
                         "ChanIn",
                         a,
-                        indices=[w, h],
+                        indices=[h, w],
                         offsets=[offset0, offset1],
                         sizes=TILE_SIZE,
                         strides=[IMAGE_WIDTH, 1],
@@ -69,7 +69,7 @@ def launch_body(a, b):
                     ChannelGet(
                         "ChanOut",
                         b,
-                        indices=[w, h],
+                        indices=[h, w],
                         offsets=[offset0, offset1],
                         sizes=TILE_SIZE,
                         strides=[IMAGE_WIDTH, 1],
diff --git a/programming_examples/channel_examples/worker_to_self/Makefile b/programming_examples/channel_examples/worker_to_self/Makefile
index 79be368b8..44cd8ea52 100644
--- a/programming_examples/channel_examples/worker_to_self/Makefile
+++ b/programming_examples/channel_examples/worker_to_self/Makefile
@@ -4,9 +4,14 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/worker_to_self.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py -v
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/worker_to_self.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/channel_examples/worker_to_self/run.py b/programming_examples/channel_examples/worker_to_self/run.py
deleted file mode 100644
index c5b7825d5..000000000
--- a/programming_examples/channel_examples/worker_to_self/run.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-from worker_to_self import *
-
-INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def print_matrix(matrix_array):
-    for i in range(IMAGE_HEIGHT):
-        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
-        for val in row:
-            val = val & 0xFFFF
-            print(f"{val:04x}", end=" ")
-        print("")
-
-
-def test_main(build_module, verbose=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = i + 0x1000
-        input_b[i] = 0x00DEFACED
-
-    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
-
-    if verbose:
-        print_matrix(input_b)
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        addone = backend.compile_and_load(mlir_module)
-        (_, output_b) = addone(input_a, input_b)
-
-    backend.unload()
-
-    if verbose:
-        print_matrix(output_b)
-
-    # check output, should have all values incremented
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-        expected_value = input_a[i]
-
-        # value should have been updated
-        if not (rb == expected_value):
-            """
-            row = i // IMAGE_WIDTH
-            col = i % IMAGE_WIDTH
-            print(
-                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
-            )
-            """
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel_examples/worker_to_self example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/channel_examples/worker_to_self/worker_to_self.py b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
index ee9f68293..0d19f24eb 100644
--- a/programming_examples/channel_examples/worker_to_self/worker_to_self.py
+++ b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
@@ -1,24 +1,30 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
 IMAGE_WIDTH = 32
 IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
+
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
 def build_module():
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
 
     # Type and method of input/output
-    memrefTyInOut = T.MemRefType.get(IMAGE_SIZE, T.i32())
+    memrefTyInOut = T.MemRefType.get(IMAGE_SIZE, xrt_dtype)
     ChannelOp("ChanIn")
     ChannelOp("ChanOut")
     ChannelOp("ToSelf")
@@ -26,14 +32,14 @@ def build_module():
     mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
     image_type_l1 = MemRefType.get(
         shape=IMAGE_SIZE,
-        element_type=T.i32(),
+        element_type=xrt_dtype,
         memory_space=mem_space_l1,
     )
 
     mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)
     image_type_l2 = MemRefType.get(
         shape=IMAGE_SIZE,
-        element_type=T.i32(),
+        element_type=xrt_dtype,
         memory_space=mem_space_l2,
     )
 
@@ -71,8 +77,8 @@ def herd_body(tx, ty, sx, sy, tensor_in_l2):
                     ChannelGet("ToSelf", tensor_in_l1)
 
                     # Access every value in the tile
-                    for j in range_(IMAGE_HEIGHT):
-                        for i in range_(IMAGE_WIDTH):
+                    for i in range_(IMAGE_HEIGHT):
+                        for j in range_(IMAGE_WIDTH):
                             # Load the input value from tile_in
                             val = load(tensor_in_l1, [i, j])
 
@@ -91,5 +97,35 @@ def herd_body(tx, ty, sx, sy, tensor_in_l2):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="worker_to_self.py",
+        description="Builds, runs, and tests the channel worker_to_self example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_matrix = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(
+        IMAGE_SIZE
+    )
+    output_matrix = input_matrix.copy()
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix]
+        )
+    )
diff --git a/programming_examples/channel_examples/worker_to_worker/Makefile b/programming_examples/channel_examples/worker_to_worker/Makefile
index 79be368b8..7b179eadc 100644
--- a/programming_examples/channel_examples/worker_to_worker/Makefile
+++ b/programming_examples/channel_examples/worker_to_worker/Makefile
@@ -4,9 +4,14 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 targetname := $(shell basename ${srcdir})
 
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/worker_to_worker.py -p
+
 run:
-	mkdir -p build
-	cd build && ${powershell} python3 ${srcdir}/run.py -v
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/worker_to_worker.py
 
 clean:
-	rm -rf build __pycache__
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/channel_examples/worker_to_worker/run.py b/programming_examples/channel_examples/worker_to_worker/run.py
deleted file mode 100644
index b62fc4746..000000000
--- a/programming_examples/channel_examples/worker_to_worker/run.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# run.py -*- Python -*-
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
-import argparse
-import numpy as np
-import air.backend.xrt as xrt_backend
-import filelock
-
-from worker_to_worker import *
-
-INOUT_DATATYPE = np.uint32
-INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
-INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
-INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
-
-
-def print_matrix(matrix_array):
-    for i in range(IMAGE_HEIGHT):
-        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
-        for val in row:
-            val = val & 0xFFFF
-            print(f"{val:04x}", end=" ")
-        print("")
-
-
-def test_main(build_module, verbose=False):
-    mlir_module = build_module()
-
-    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
-    for i in range(INOUT_SIZE):
-        input_a[i] = i + 0x1000
-        input_b[i] = 0x00DEFACED
-
-    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
-
-    if verbose:
-        print_matrix(input_b)
-
-    # run the module
-    with filelock.FileLock("/tmp/npu.lock"):
-        addone = backend.compile_and_load(mlir_module)
-        (_, output_b) = addone(input_a, input_b)
-
-    backend.unload()
-
-    if verbose:
-        print_matrix(output_b)
-
-    # check output, should have all values incremented
-    errors = 0
-    for i in range(INOUT_SIZE):
-        rb = output_b[i]
-        expected_value = input_a[i]
-
-        # value should have been updated
-        if not (rb == expected_value):
-            """
-            row = i // IMAGE_WIDTH
-            col = i % IMAGE_WIDTH
-            print(
-                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
-            )
-            """
-            errors += 1
-
-    if errors == 0:
-        print("PASS!")
-        exit(0)
-    else:
-        print("failed. errors=", errors)
-        exit(-1)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel_examples/worker_to_worker example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    args = parser.parse_args()
-    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
index e1b8fa256..a88b5ba06 100644
--- a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
+++ b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
@@ -1,5 +1,7 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
 
 from air.ir import *
 from air.dialects.air import *
@@ -7,30 +9,34 @@
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects.affine import apply as affine_apply
+from air.backend.xrt_runner import XRTRunner, type_mapper
 
 range_ = for_
 
-IMAGE_WIDTH = 32
+IMAGE_WIDTH = 48
 IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
 
 TILE_WIDTH = 16
 TILE_HEIGHT = 8
-TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]
+TILE_SIZE = [TILE_HEIGHT, TILE_WIDTH]
 
-assert IMAGE_WIDTH % TILE_WIDTH == 0
 assert IMAGE_HEIGHT % TILE_HEIGHT == 0
+assert IMAGE_WIDTH % TILE_WIDTH == 0
+
+INOUT_DATATYPE = np.int32
 
 
 @module_builder
 def build_module():
-    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
     # Create an input/output channel pair per worker
-    ChannelOp("ChanIn", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
-    ChannelOp("ChanOut", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
+    ChannelOp("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
+    ChannelOp("ChanOut", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
     ChannelOp(
-        "SwitchTiles", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT]
+        "SwitchTiles", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH]
     )
 
     # We will send an image worth of data in and out
@@ -44,32 +50,32 @@ def launch_body(a, b):
             # Transfer one tile of data per worker
             for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
                 for w in range(IMAGE_WIDTH // TILE_WIDTH):
-                    offset0 = IMAGE_HEIGHT * h
-                    offset1 = IMAGE_HEIGHT * w
+                    offset0 = TILE_HEIGHT * h
+                    offset1 = TILE_WIDTH * w
 
                     # Put data into the channel tile by tile
                     ChannelPut(
                         "ChanIn",
                         a,
-                        indices=[w, h],
+                        indices=[h, w],
                         offsets=[offset0, offset1],
-                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        sizes=TILE_SIZE,
                         strides=[IMAGE_WIDTH, 1],
                     )
 
             # Transfer one tile of data per worker
             for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
                 for w in range(IMAGE_WIDTH // TILE_WIDTH):
-                    offset0 = IMAGE_HEIGHT * h
-                    offset1 = IMAGE_HEIGHT * w
+                    offset0 = TILE_HEIGHT * h
+                    offset1 = TILE_WIDTH * w
 
                     # Write data back out to the channel tile by tile
                     ChannelGet(
                         "ChanOut",
                         b,
-                        indices=[w, h],
+                        indices=[h, w],
                         offsets=[offset0, offset1],
-                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        sizes=TILE_SIZE,
                         strides=[IMAGE_WIDTH, 1],
                     )
 
@@ -79,7 +85,7 @@ def segment_body():
 
                 @herd(
                     name="xaddherd",
-                    sizes=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT],
+                    sizes=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH],
                 )
                 def herd_body(th, tw, _sx, _sy):
                     height_next = AffineMap.get(
@@ -115,7 +121,7 @@ def herd_body(th, tw, _sx, _sy):
                     mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
                     tile_type = MemRefType.get(
                         shape=TILE_SIZE,
-                        element_type=T.i32(),
+                        element_type=xrt_dtype,
                         memory_space=mem_space,
                     )
 
@@ -126,11 +132,11 @@ def herd_body(th, tw, _sx, _sy):
                     tile_out2 = AllocOp(tile_type, [], [])
 
                     # Copy a tile from the input image
-                    ChannelGet("ChanIn", tile_in, indices=[tw, th])
+                    ChannelGet("ChanIn", tile_in, indices=[th, tw])
 
                     # Access every value in the tile
-                    for j in range_(TILE_HEIGHT):
-                        for i in range_(TILE_WIDTH):
+                    for i in range_(TILE_HEIGHT):
+                        for j in range_(TILE_WIDTH):
                             # Load the input value from tile_in
                             val = load(tile_in, [i, j])
 
@@ -140,14 +146,14 @@ def herd_body(th, tw, _sx, _sy):
                         yield_([])
 
                     # Copy the output tile into a channel for another worker to get
-                    ChannelPut("SwitchTiles", tile_out, indices=[tw, th])
+                    ChannelPut("SwitchTiles", tile_out, indices=[th, tw])
 
                     # Get an output tile from another worker
-                    ChannelGet("SwitchTiles", tile_in2, indices=[tw_next, th_next])
+                    ChannelGet("SwitchTiles", tile_in2, indices=[th_next, tw_next])
 
                     # Access every value in the tile
-                    for j in range_(TILE_HEIGHT):
-                        for i in range_(TILE_WIDTH):
+                    for i in range_(TILE_HEIGHT):
+                        for j in range_(TILE_WIDTH):
                             # Load the input value from tile_in
                             val = load(tile_in2, [i, j])
 
@@ -157,7 +163,7 @@ def herd_body(th, tw, _sx, _sy):
                         yield_([])
 
                     # Send the output tile to the output
-                    ChannelPut("ChanOut", tile_out, indices=[tw, th])
+                    ChannelPut("ChanOut", tile_out, indices=[th, tw])
 
                     # Deallocate our L1 buffers
                     DeallocOp(tile_in)
@@ -167,5 +173,35 @@ def herd_body(th, tw, _sx, _sy):
 
 
 if __name__ == "__main__":
-    module = build_module()
-    print(module)
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the channel worker_to_worker example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    input_matrix = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE)
+    # TODO: this check is not fully correct, should show how data is transferred explicitly
+    # Will probably want to use different input to implement a more rigorous check.
+    output_matrix = input_matrix.copy()
+
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix]
+        )
+    )

From 497a591b5c83f37368e59e3b4a96d24dc3a6beff Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 30 Jul 2024 10:07:38 -0600
Subject: [PATCH 31/31] update programming example documentation

---
 programming_examples/README.md                | 16 +++--
 .../channel_examples/README.md                | 58 +++++++++----------
 .../data_transfer_transpose/README.md         | 22 ++++++-
 .../matrix_scalar_add/README.md               | 51 ++++++++--------
 programming_examples/multi_segment/README.md  | 39 +++++++++++++
 programming_examples/passthrough/README.md    | 13 ++++-
 programming_examples/segment_alloc/README.md  | 41 +++++++++++++
 programming_examples/shim_dma_2d/README.md    | 11 +++-
 8 files changed, 184 insertions(+), 67 deletions(-)
 create mode 100644 programming_examples/multi_segment/README.md
 create mode 100644 programming_examples/segment_alloc/README.md

diff --git a/programming_examples/README.md b/programming_examples/README.md
index 10e4d3aef..df3d1e28a 100644
--- a/programming_examples/README.md
+++ b/programming_examples/README.md
@@ -4,26 +4,30 @@ These programming examples are provided so that application programmers can lear
 
 ## [2-Dimensional Shim DMA Passthrough](shim_dma_2d)
 
-This example demonstrates how data may be moved using shim DMA operations. It also includes extra infrastructure that illustrates different ways to compile, build, run, and test programs written using the mlir-air python bindings.
+This example demonstrates how data may be moved using shim DMA operations. It also includes extra infrastructure that illustrates different ways to compile, build, run, and test programs written using the mlir-air python bindings on an NPU.
 
 ## [Passthrough Examples](passthrough)
 
-Three examples that copy data from the input to the output (a data passthrough). The data movement is done through either DMA or Channels, and there is a simple example of calling a an external function which performs a vectorized memcopy.
+This directory contains three examples that each copy data from the input to the output (a data passthrough). The data movement is done through either DMA or Channels, and there is a simple example of calling an external function which performs a vectorized memcopy.
 
 ## [Channel Examples](channel_examples)
 
-This is a collection of simple examples that illustrate how to use channels.
+This is a collection of simple examples that illustrate how to use *channels*. At a high level, channels are the abstraction for data movement in mlir-air. Some of the examples are experimental works-in-progress.
 
 ## [Matrix Scalar Addition](matrix_scalar_add)
 
-This example provides logic to divide in input 2D matrix into *tiles* of data, and add a value to every element in every tile. It includes some description of the fundamental concepts of mlir-air, including *launches*, *herds*, and *channels*.
+This example provides logic to divide an input 2D matrix into *tiles* of data, and add a value to every element in every tile. It includes some description of the fundamental concepts of mlir-air, including *launches*, *herds*, and *channels*. There are five different implementations of this example, some of which are experimental (and are currently works-in-progress).
 
 ## [Data Transfer Transpose](data_transfer_transpose)
 
-Transposes a matrix with using either Channels or `dma_memcpy_nd`.
+Transposes a matrix using either air channels or `dma_memcpy_nd`.
+
+## [Segment Alloc](segment_alloc)
+
+While a *worker* (a compute unit managed as part of a *herd*) is able to allocate L1 memory, it is not able to allocate L2 memory. This must be done in the *segment*. This example shows how a segment can allocate L2 memory which is then accessed within the herd.
 
 ## [WIP: Multi-Segment Examples](multi_segment)
 
-This is a collection of simple examples that illustrate how to use multiple segments. 
+This is a collection of simple examples that illustrate how to use multiple segments.
 
 Warning: This example is a work-in-progress.
diff --git a/programming_examples/channel_examples/README.md b/programming_examples/channel_examples/README.md
index 7c1cd3e11..ce5e89fc4 100644
--- a/programming_examples/channel_examples/README.md
+++ b/programming_examples/channel_examples/README.md
@@ -1,57 +1,32 @@
 # Channel Examples
 
-This example focuses on one of the key abstractions of air: *channels*. This is a collection of examples that use channels in various ways. The patterns shown here may be used to create more complex examples.
+This collection of examples focuses on one of the key abstractions of air: *channels*. The patterns shown here may be used to create more complex examples.
 
 ## Running and Testing
 
 #### ```herd-to-herd```: Using a channel to pass data between herd.
 
-There are two part of this example: two herds within one segment (single segment), and one herd per segment for two segments (multi-segment)
+There are two parts of this example: two herds within one segment (single segment), and one herd per segment for two segments (multi-segment).
 
-The single segment example example ([herd_to_herd/single_segment/herd_to_herd.py](herd_to_herd/single_segment/herd_to_herd.py)) defines two `herd`s within the same `launch` + `segment`. There is a *producer herd*, which writes data to a `Herd2Herd` channel, and a *consumer herd*, which reads data form the `Herd2Herd` channel.
-
-```bash
-cd herd_to_herd/single_segment
-make clean && make
-```
+The single segment example ([herd_to_herd/single_segment/herd_to_herd.py](herd_to_herd/single_segment/herd_to_herd.py)) defines two *herds* within the same *launch* and *segment*. There is a *producer herd*, which writes data to a `Herd2Herd` channel, and a *consumer herd*, which reads data from the `Herd2Herd` channel.
 
 The multi-segment example ([herd_to_herd/multi_segment/herd_to_herd.py](herd_to_herd/multi_segment/herd_to_herd.py)) defines two `segment`s, each with one `herd`, within the same `launch`. There is a *producer_segment* with a *producer herd*, which writes data to a `Herd2Herd` channel, and a *consumer_segment* with a *consumer herd*, which reads data form the `Herd2Herd` channel.
 
 Warning: The multi-segment example is a work in progress!
 
-```bash
-cd herd_to_herd/multi_segment
-make clean && make
-```
-
 #### ```channel-size```: Use the channel size argument
 
 This example ([channel_size/channel_size.py](channel_size/channel_size.py)) is a data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, only instead of using a separately defined channel for each tile/core, a bundle of channels is created (using the `ChannelOp` `size` parameter) and indexed into (the `ChannelGet` and `ChannelPut` `indices` parameter).
 
-```bash
-cd channel_size
-make clean && make
-```
-
 #### ```hierarchical```: Use channels for sending data from Launch to Segment to Herd and back again
 
 This example ([hierarchical/hierarchical.py](hierarchical/hierarchical.py)) is a data passthrough example that uses a channel to send data from Launch to Segment (L3->L2 memory) and then from Segment to Herd (L2->L1 memory). The data is then sent back on an analogous path.
 
-```bash
-cd hierarchical
-make clean && make
-```
-
 #### WIP: ```worker-to-self```:
 
 This example ([worker_to_self/worker_to_self.py](worker_to_self/worker_to_self.py)) is a work-in-progress data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, only the sole worker in the herd does some extra shuffling between input and output by putting the current data tile into a channel and then getting it from the same channel.
 
-WARNING: This example currently fails because it is assumed channel gets/parts are not from the same memory region, and this example breaks this assumption.
-
-```bash
-cd worker_to_self
-make clean && make
-```
+WARNING: This example currently fails for unknown reasons.
 
 #### WIP: ```worker-to-worker```:
 
@@ -59,9 +34,28 @@ This example ([worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_
 
 WARNING: This example currently fails for unknown reasons.
 
+#### Usage (For All Examples)
+
+To generate AIR MLIR from Python:
 ```bash
-cd worker_to_worker
+cd <example_dir>
+make clean && make print
+```
+
+To run:
+```bash
+cd <example_dir>
 make clean && make
-``
+```
 
-#### WIP: more examples!
+To run with verbose output:
+```bash
+cd <example_dir>
+python <example_file>.py -v
+```
+
+You may be able to configure examples (data types, sizes); to get additional usage information, run:
+```bash
+cd <example_dir>
+python <example_file>.py -h
+```
diff --git a/programming_examples/data_transfer_transpose/README.md b/programming_examples/data_transfer_transpose/README.md
index d73acfe6f..772c0eb2f 100644
--- a/programming_examples/data_transfer_transpose/README.md
+++ b/programming_examples/data_transfer_transpose/README.md
@@ -2,8 +2,28 @@
 
 Transposes a matrix with using either Channels or `dma_memcpy_nd`.
 
-#### Build and Run
+#### Usage (For Both Examples)
 
+To generate AIR MLIR from Python:
 ```bash
+cd <example_dir>
+make clean && make print
+```
+
+To run:
+```bash
+cd <example_dir>
 make clean && make
 ```
+
+To run with verbose output:
+```bash
+cd <example_dir>
+python transpose.py -v
+```
+
+You can also change some other parameters; to get usage information, run:
+```bash
+cd <example_dir>
+python transpose.py -h
+```
diff --git a/programming_examples/matrix_scalar_add/README.md b/programming_examples/matrix_scalar_add/README.md
index 505126b58..9ed611771 100644
--- a/programming_examples/matrix_scalar_add/README.md
+++ b/programming_examples/matrix_scalar_add/README.md
@@ -2,55 +2,56 @@
 
 This example focuses on a core concept: processing input as a grid of smaller inputs. In this case, each implementation of the matrix scalar addition program breaks a 2-dimensional matrix of input data (the *image*) into smaller 2-dimensional regions (the *tiles*), and then increments every value in each tile with a constant specific to that tile.
 
-There are several versions of this example that use memcopies, but there are also some versions of this example that use *channels*, the primary abstraction used to represent data movement provided by the mlir-air python bindings. In this example, there is an input channel (`ChanIn`) and an output channel (`ChanOut`). The data is moved into/out of channels to/from the arguments in the mlir-air *launch*; the data is then retrieved from the input channel, processed, and written back to the output channel at the *herd* level.
+There are several versions of this example that use DMA memcopies, but there are also some versions of this example that use *channels*, the primary abstraction used to represent data movement provided by the mlir-air python bindings. In this example, there is an input channel (`ChanIn`) and an output channel (`ChanOut`). The data is moved into/out of channels to/from the arguments in the mlir-air *launch*; the data is then retrieved from the input channel, processed, and written back to the output channel at the *herd* level.
 
 ## Running and Testing
 
-For illustrative purposes, there are several versions of this example: ```single-core-dma```, ```multi-core-dma```, ```single-core-channel```, ```multi-core-channel```, and ```multi-launch-channel```.
+For illustrative purposes, there are several versions of this example: ```single_core_dma```, ```multi_core_dma```, ```single_core_channel```, ```multi_core_channel```, and ```multi_launch_channel```. Note that ```multi_launch_channel``` is a WIP and is not functional as multiple launches are not yet supported.
 
 #### ```single-core-dma```: Tiling using DMA sizes, offsets, and strides
 
 This example ([single_core_dma/single_core_dma.py](single_core_dma/single_core_dma.py)) uses *sizes*, *offsets*, and *strides* to explicitly loop *n* total tiles, and then fetch, increment, and put one tile at a time. The entirety of the work is done by one launch which manages one segment which manages one herd which manages one core.
 
-```bash
-cd single_core_dma
-make clean && make
-```
-
-#### [WIP] ```multi-core-dma```: Tiling using DMA sizes, offsets, and strides with multiple compute cores
+#### ```multi-core-dma```: Tiling using DMA sizes, offsets, and strides with multiple compute cores
 
 This example ([multi_core_dma/single_core_dma.py](multi_core_dma/multi_core_dma.py)) uses *sizes*, *offsets*, and *strides*. Unlike the `single-core-dma` example, this example uses a herd size that maps to the 2-dimensional number of tiles in the image. No explicit loop is needed to processes each tile as each compute tile will process exactly one tile of data. The entirety of the work is done by one launch which manages one segment which manages one herd which manages *n* cores, where *n* is the number of data tiles.
 
-```bash
-cd multi_core_dma
-make clean && make
-```
-
 #### ```single-core-channel```: Tiling using Channel sizes, offsets, and strides
 
 This example ([single_core_dma/single_core_dma.py](single_core_dma/single_core_dma.py)) uses *sizes*, *offsets*, and *strides* to explicitly loop *n* total tiles, and then fetch, increment, and put one tile at a time. L3 data must be written to/from channels at the launch level, so the launch transforms the sequential image data to/from a sequence of sequential tile data using a series of specially constructed `ChannelPut` and `ChannelGet` operations. The compute core is then able to access each data tile by tile using simple `ChannelPut` and `ChannelGet` operations.
 
-```bash
-cd single_core_dma
-make clean && make
-```
-
-#### [WIP] ```multi-core-channel```: Tiling using Channel sizes, offsets, and strides with multiple compute cores
+#### ```multi-core-channel```: Tiling using Channel sizes, offsets, and strides with multiple compute cores
 
 This example ([multi_core_dma/single_core_dma.py](multi_core_dma/multi_core_dma.py)) uses *sizes*, *offsets*, and *strides* to explicitly loop *n* total tiles, and then fetch, increment, and put one tile at a time. L3 data must be written to/from channels at the launch level, so the launch transforms the sequential image data to/from a sequence of sequential tile data using a series of specially constructed `ChannelPut` and `ChannelGet` operations. Unlike the `single-core-channel` example, this example uses a herd size that maps to the 2-dimensional number of tiles in the image. No explicit loop is needed to processes each tile as each compute tile will process exactly one tile of data. The entirety of the work is done by one launch which manages one segment which manages one herd which manages *n* cores, where *n* is the number of data tiles.
 
+#### [WIP] ```multi-launch-channel```: This example is under construction
+
+This example ([multi_launch_channel/multi_launch_channel.py](multi_launch_channel/multi_launch_channel.py)) uses multiple launches. It is currently a work in progress as multiple launches are not yet fully supported.
+
+#### Usage (For All Examples)
+
+To generate AIR MLIR from Python:
 ```bash
-cd multi_core_dma
-make clean && make
+cd <example_dir>
+make clean && make print
 ```
 
-#### [WIP] ```multi-launch-channel```: This example is under construction
+To run:
+```bash
+cd <example_dir>
+make clean && make
+```
 
-This example ([multi_launch_channel/multi_launch_channel.py](multi_launch_channel/multi_launch_channel.py)) uses multiple launches. It is currently incomplete.
+To run with verbose output:
+```bash
+cd <example_dir>
+python <example_file>.py -v
+```
 
+You can also change some other parameters; to get usage information, run:
 ```bash
-cd multi_launch_channel
-make clean && make
+cd <example_dir>
+python <example_file>.py -h
 ```
 
 ## Recommended Exercises
diff --git a/programming_examples/multi_segment/README.md b/programming_examples/multi_segment/README.md
new file mode 100644
index 000000000..a6a84bc36
--- /dev/null
+++ b/programming_examples/multi_segment/README.md
@@ -0,0 +1,39 @@
+<!---//===- README.md -----------------------------------------*- Markdown -*-===//
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+// 
+//===----------------------------------------------------------------------===//-->
+
+# WIP: Multi-Segment Examples
+
+These two examples are attempts to create AIR programs using multiple segments. For simplicity, the segments are logically independent (they do not communicate with each other).
+
+Warning! Neither of the examples is functional. The design has been checked in each case by running it with one segment or the other, but in its entirety, multiple segments are not yet supported.
+
+## Usage
+
+### Generate AIR MLIR from Python
+
+Run:
+```bash
+make print
+```
+OR 
+
+```bash
+python multi_segment.py -p
+```
+
+### Running
+
+To compile and run the design:
+
+```bash
+make
+```
+
+To run with verbose output, either modify the makefile or specify verbose directly with:
+```bash
+python multi_segment.py -v
+```
diff --git a/programming_examples/passthrough/README.md b/programming_examples/passthrough/README.md
index 3302a38f5..cda1adc01 100644
--- a/programming_examples/passthrough/README.md
+++ b/programming_examples/passthrough/README.md
@@ -11,7 +11,7 @@ This set of passthrough designs demonstrates a simple MLIR-AIR implementation fo
 
 ## Source Files Overview
 
-1. [`passthrough_dma/passthrough_dma.py`](./passthrough_dma/passthrough_dma.py), [`passthrough_channel/passthrough_channel.py`](passthrough_channel/passthrough_channel.py), [`passthrough_kernel/passthrough_kernel.py`](passthrough_kernel/passthrough_kernel.py): Python scripts that defines the module design for each example using MLIR-AIR Python bindings. The file generates MLIR that is then compiled using `aircc.py` to produce design binaries (i.e. `XCLBIN` and `inst.txt` for the NPU in Ryzen™ AI). You can run `python passthrough_(dma|channel|kernel).py` to generate the MLIR.
+1. [`passthrough_dma/passthrough_dma.py`](./passthrough_dma/passthrough_dma.py), [`passthrough_channel/passthrough_channel.py`](passthrough_channel/passthrough_channel.py), [`passthrough_kernel/passthrough_kernel.py`](passthrough_kernel/passthrough_kernel.py): Python scripts that define the module design for each example using MLIR-AIR Python bindings. Each file generates MLIR that is then compiled using `aircc.py` to produce design binaries (i.e. `XCLBIN` and `inst.txt` for the NPU in Ryzen™ AI). You can run `python passthrough_(dma|channel|kernel).py -p` or `make print` to generate the AIR MLIR.
 
 1. `passThrough.cc`: A C++ implementation of vectorized memcpy operations for AIE cores. It is found in the [mlir-aie repo](https://github.com/Xilinx/mlir-aie) under [`mlir-aie/aie_kernels/generic/passThrough.cc`](https://github.com/Xilinx/mlir-aie/blob/main/aie_kernels/generic/passThrough.cc)
 
@@ -23,10 +23,21 @@ See the [design overview](https://github.com/Xilinx/mlir-aie/tree/main/programmi
 
 ## Usage
 
+### Generate AIR MLIR from Python
+
+```bash
+make print
+```
+
 ### Running
 
 To compile and run the design:
 
 ```bash
 make
+```
+
+To run with verbose settings, either modify the makefile, or run directly:
+```bash
+python passthrough_(dma|channel|kernel).py -v
 ```
\ No newline at end of file
diff --git a/programming_examples/segment_alloc/README.md b/programming_examples/segment_alloc/README.md
new file mode 100644
index 000000000..e8d1f1748
--- /dev/null
+++ b/programming_examples/segment_alloc/README.md
@@ -0,0 +1,41 @@
+<!---//===- README.md -----------------------------------------*- Markdown -*-===//
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+// 
+//===----------------------------------------------------------------------===//-->
+
+# Segment L2 Allocation Example
+
+This example is functionally a data passthrough, but what makes it interesting is that some of the data accessed in a herd is L2 data allocated by a segment. This L2 memory is passed as an argument into the herd.
+
+## Source Files Overview
+
+1. [`segment_alloc.py`](./segment_alloc.py): Python script that defines the module design for the example using MLIR-AIR Python bindings. The file generates MLIR that is then compiled using `aircc.py` to produce design binaries (i.e. `XCLBIN` and `inst.txt` for the NPU in Ryzen™ AI). This file also contains the code needed to run the program on the NPU and test the output.
+
+## Usage
+
+### Generate AIR MLIR from Python
+
+Run:
+```bash
+make print
+```
+OR 
+
+```bash
+python segment_alloc.py -p
+```
+
+### Running
+
+To compile and run the design:
+
+```bash
+make
+```
+
+To run with verbose output, either modify the makefile or specify verbose directly with:
+```bash
+python segment_alloc.py -v
+```
diff --git a/programming_examples/shim_dma_2d/README.md b/programming_examples/shim_dma_2d/README.md
index f1af3dbc4..60a15feaa 100644
--- a/programming_examples/shim_dma_2d/README.md
+++ b/programming_examples/shim_dma_2d/README.md
@@ -1,11 +1,16 @@
 # shim_dma_2d
 
-This example demonstrates how data may be moved using shim DMA operations. In this example, a 2-dimensional block of data (referred to in test code as an *image*) is set to have some specific values. The upper corner of the image (referred to in test code as the *tile*) is transferred to a compute core using DMA. The compute core then reads and outputs all the data in the tile. The tile is read back into an output image. When run, the output image is checked to verify that the tile region shows the values from the input image (showing the data transfer was successful) while the remainder of the output image is checked to ensure it retains the original output image values (showing the data is written to the correct tile region in the output image).
+This example demonstrates how data may be moved using shim DMA operations. In this example, a 2-dimensional block of data (referred to in test code as an *image*) is set to have some specific values. The upper corner of the image (referred to in test code as the *tile*) is transferred to a compute core using DMA. The compute core then copies all the data in the tile and sends the data out into an output *image*. When run, the output image is checked to verify that the tile region shows the values from the input image (showing the data transfer was successful) while the remainder of the output image is checked to ensure it retains the original output image values (showing the data is written to the correct tile region in the output image).
 
 The logic in this example is defined in [shim_dma_2d.py](shim_dma_2d.py), and uses Python AIR bindings to generate AIR MLIR.
 
 ## Running and Testing
 
+To generate AIR MLIR from the Python bindings, run:
+```bash
+python shim_dma_2d.py
+```
+
 For illustrative purposes, we provide three different ways to run and test this example. The three approaches are functionally equivalent but the implementation of each approach differs. The general workflow of each is:
 * Build
   * The AIR Python bindings are used to generate AIR MLIR (generally a file called ```air.mlir```)
@@ -19,11 +24,13 @@ For illustrative purposes, we provide three different ways to run and test this
 
 ### Method 1: Run and test with AIR utility functions
 
-This is the cleanest and simplest method of specifying a workflow to run AIR MLIR on an NPU, and uses code in the [run.py](run.py) file. The utility functions greatly simplify setting up input/output data and allow ```aircc.py``` to use a default set of pipelines and passes. For this example, ```aircc.py``` is configured with ```--experimental```, which adds some additional experimental passes to the pipeline with the goal of increased efficiency.
+This is the cleanest and simplest method of specifying a workflow to run AIR MLIR on an NPU, and uses code in the [run.py](run.py) file. The utility class ```XRTRunner``` simplifies setting up input/output data and allows the user to specify the input and an expected output using ```numpy``` ```ndarray```s. Behind the scenes, ```XRTRunner``` calls ```aircc.py``` with the default set of pipelines and passes, but for this and most examples, we use ```experimental_passes``` to signify we also want to run additional passes which should increase efficiency.
 ```bash
 make pyworkflow
 ```
 
+Note that if you'd like to run this example with verbose output - to see all the commands that are run, for example - you can change the ```VERBOSE``` variable in ```run.py``` to ```True```.
+
 ### Method 2: Generate AIR MLIR with python, compile on the command line, and run with python
 
 This method uses the [test.py](test.py) file. While method 1 may be more user-friendly, this method is included as a frame of reference to understand the processes and steps that are abstracted by the AIR XRT backend utility functions used in method 1.