From 688912654423d63adf646d7f21721b034a30219f Mon Sep 17 00:00:00 2001
From: Jack Lo <jack.lo@amd.com>
Date: Thu, 31 Oct 2024 22:42:05 -0600
Subject: [PATCH 1/6] Fix trace for mm. Change default to vec and trace on.

---
 .../basic/matrix_multiplication/single_core/aie2.py           | 4 ++--
 programming_guide/section-4/section-4c/README.md              | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index 76d5125cba..b0f6b569d5 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -79,7 +79,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
     assert n % t == 0
 
     vectorized = True
-    enable_tracing = False
+    enable_tracing = True
     trace_size = 65536
 
     dtype_in = dtype_map[dtype_in_str]
@@ -109,7 +109,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
 
     with mlir_mod_ctx() as ctx:
 
-        C_sz_in_bytes = C_sz * np.dtype(dtype_out).itemsize // 8
+        C_sz_in_bytes = C_sz * np.dtype(dtype_out).itemsize
 
         @device(AIEDevice.npu1_1col)
         def device_body():
diff --git a/programming_guide/section-4/section-4c/README.md b/programming_guide/section-4/section-4c/README.md
index 5ad1102c3a..6ccb0b8d71 100644
--- a/programming_guide/section-4/section-4c/README.md
+++ b/programming_guide/section-4/section-4c/README.md
@@ -206,9 +206,9 @@ Looking at this table, we quickly see that the data movement is the bottleneck f
 
     Mouse over the blocks of PortRuning0 and PortRunning1, what is the measured number of cycles per chunk? <img src="../../../mlir_tutorials/images/answer1.jpg" title="512 cycles" height=25> This matches what we expected to see. But note how it's obvious from the waveform how dominant data movement is as compared to compute. 
 
-1. We can already see that our design is inbalanced between data movement and compute where we have 72 cycles for compute and 512 cycles for data movement. Let's take a look at the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/) and see if we can do better. In the description, it talks about each iteration of the kernel is by default configured for MxKxN values of 64x64x64 giving us 262,144 MACs. Given that we're working with `int16_t` datatype which has 64 MACs per clock, how many cycles will the ideal case take?  <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 262,144/ 64" height=25> Given that the A and B matrix are each 64x64 x `int16_t` and our stream switch channels are are 32-bits wide, how many cycles does it take to move data to the compute tile (bear in mind A and B can be moved in parallel via separate channels). <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 64x64/2" height=25>
+1. We can already see that our design is imbalanced between data movement and compute where we have 72 cycles for compute and 512 cycles for data movement. Let's take a look at the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/single_core) and see if we can do better. The description explains that each iteration of the kernel is by default configured for MxKxN values of 64x64x64 giving us 262,144 MACs. Given that we're working with `int16_t` datatype which has 64 MACs per clock, how many cycles will the ideal case take?  <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 262,144/ 64" height=25> Given that the A and B matrices are each 64x64 x `int16_t` and our stream switch channels are 32 bits wide, how many cycles does it take to move data to the compute tile (bear in mind A and B can be moved in parallel via separate channels)? <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 64x64/2" height=25>
 
-1. So this example should be perfectly balanced between compute and data movement! Navigate to the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/) and run the trace build (`make clean; make trace`). Then open the generated waveform json (`trace_mm.json`) and measure the delta between `event 0` and `event 1` in the first run. What value did you get and how close is it to ideal? <img src="../../../mlir_tutorials/images/answer1.jpg" title="~2535 cycles which is 80% of 2048" height=25> You should now see that both the compute cycles and the data movement cycles are much more closely matched!
+1. So this example should be perfectly balanced between compute and data movement! Navigate to the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/single_core) and run the trace build (`make clean; make -f Makefile.chess trace`). Then open the generated waveform json (`trace_mm.json`) and measure the delta between `event 0` and `event 1` in the first run. What value did you get and how close is it to ideal? <img src="../../../mlir_tutorials/images/answer1.jpg" title="~2535 cycles which is 80% of 2048" height=25> You should now see that both the compute cycles and the data movement cycles are much more closely matched!
 
 ## <u>Diving Deep - Examining the Microcode</u>
 Let's take another look at the results of our [vector_scalar_mul design](../../../programming_examples/basic/vector_scalar_mul/). Let's also go back one step and comment out `chess_prepare_for_pipelining chess_loop_range(16, )` and rerun the compilation (`make clean; make trace`). 

From 31171eb7b85f8a11c9cd2ecd7f3908597123d2a8 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Fri, 1 Nov 2024 03:03:26 -0600
Subject: [PATCH 2/6] enable_tracing default False

---
 .../basic/matrix_multiplication/single_core/aie2.py             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index b0f6b569d5..7a3cb54d34 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -79,7 +79,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
     assert n % t == 0
 
     vectorized = True
-    enable_tracing = True
+    enable_tracing = False
     trace_size = 65536
 
     dtype_in = dtype_map[dtype_in_str]

From 2f5f616053fc3d5ecb4120901133b40e15f1cf66 Mon Sep 17 00:00:00 2001
From: Jack Lo <jack.lo@amd.com>
Date: Fri, 1 Nov 2024 14:25:05 -0600
Subject: [PATCH 3/6] Updated trace function calls and made trace_size a run
 time param

---
 .../matrix_multiplication/makefile-common     | 30 ++++++++++---
 .../matrix_multiplication/single_core/aie2.py | 45 +++++--------------
 2 files changed, 33 insertions(+), 42 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index 182c0d6a4c..756e80b35c 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -39,6 +39,8 @@ K?=512
 N?=512
 dtype_in?=i16
 dtype_out?=i32
+trace_size?=65536
+
 
 ifeq ($(dtype_in),bf16)
 	dtype_in_cpp=std::bfloat16_t
@@ -70,11 +72,11 @@ ifeq ($(dtype_out),i8)
 	dtype_acc_cpp=int8_t
 endif
 
-trace_size?=65536
-
 target_suffix?=${M}x${K}x${N}
 mlir_target?=build/aie_${target_suffix}.mlir
+trace_mlir_target?=build/aie_trace_${target_suffix}.mlir
 xclbin_target?=build/final_${target_suffix}.xclbin
+trace_xclbin_target?=build/trace_${target_suffix}.xclbin
 insts_target?=build/insts_${target_suffix}.txt
 aie_py_src?=aie2.py
 
@@ -94,7 +96,11 @@ build/%.o: ${kernels_dir}/%.cc
 
 ${mlir_target}: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
-	python3 $< ${aieargs} > $@
+	python3 $< ${aieargs} --trace_size 0 > $@
+
+${trace_mlir_target}: ${srcdir}/${aie_py_src}
+	mkdir -p ${@D}
+	python3 $< ${aieargs} --trace_size ${trace_size} > $@
 
 ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o}
 	mkdir -p ${@D}
@@ -104,6 +110,14 @@ ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o}
 				) \
 				--aie-generate-npu --npu-insts-name=${insts_target:build/%=%} $(<:%=../%)
 
+${trace_xclbin_target}: ${trace_mlir_target} ${kernels:%=build/%.o}
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				$(if $(shell [ $(CHESS) != true ] && echo true), \
+					--no-xchesscc --no-xbridge --peano ${PEANO_INSTALL_DIR}, \
+				) \
+				--aie-generate-npu --npu-insts-name=${insts_target:build/%=%} $(<:%=../%)
+
 ${targetname}.exe: ${srcdir}/test.cpp ${srcdir}/../test.cpp ${srcdir}/../common.h
 	rm -rf _build
 	mkdir -p _build
@@ -126,14 +140,16 @@ run: ${targetname}.exe ${xclbin_target}
 	export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
 	${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs}
 
-trace: ${targetname}.exe ${xclbin_target} ${insts_target}
+trace: ${targetname}.exe ${trace_xclbin_target} ${insts_target}
 	export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
-	${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size}
-	../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > trace_mm.json
+	${powershell} ./$< -x ${trace_xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs} -t ${trace_size}
+	../../../utils/parse_trace.py --filename trace.txt --mlir ${trace_mlir_target} --colshift 1 > trace_mm.json
+
+#	${powershell} ./$< -x ${trace_xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size}
 
 .PHONY: parse_trace
 parse_trace:
-	../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > trace_mm.json
+	../../../utils/parse_trace.py --filename trace.txt --mlir ${trace_mlir_target} --colshift 1 > trace_mm.json
 
 .PHONY: clean
 clean: clean_trace
diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index 7a3cb54d34..801573c1f7 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -45,9 +45,10 @@ def main():
         choices=["bf16", "i8", "i16", "f32", "i32"],
         default="i32",
     )
+    argparser.add_argument("--trace_size", type=int, default=0)
     args = argparser.parse_args()
     my_matmul(
-        args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out
+        args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out, args.trace_size
     )
 
 
@@ -55,7 +56,7 @@ def ceildiv(a, b):
     return (a + b - 1) // b
 
 
-def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
+def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str, trace_size):
 
     assert M % m == 0
     assert K % k == 0
@@ -79,8 +80,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
     assert n % t == 0
 
     vectorized = True
-    enable_tracing = False
-    trace_size = 65536
+    enable_tracing = True if trace_size > 0 else False
 
     dtype_in = dtype_map[dtype_in_str]
     dtype_out = dtype_map[dtype_out_str]
@@ -195,9 +195,10 @@ def device_body():
             )
             object_fifo_link(memC, outC)
 
-            # Set up a circuit-switched flow from core to shim for tracing information
-            if enable_tracing:
-                flow(compute_tile2, WireBundle.Trace, 0, shim_tile, WireBundle.DMA, 1)
+            # Set up a packet-switched flow from core to shim for tracing information
+            tiles_to_trace = [compute_tile2]
+            if trace_size > 0:
+                trace_utils.configure_packet_tracing_flow(tiles_to_trace, shim_tile)
 
             # Set up compute tiles
 
@@ -230,34 +231,8 @@ def core_body():
             def sequence(A, B, C):
 
                 if enable_tracing:
-                    trace_utils.configure_simple_tracing_aie2(
-                        compute_tile2,
-                        shim_tile,
-                        ddr_id=2,
-                        size=trace_size,
-                        offset=C_sz_in_bytes,
-                        events=[
-                            PortEvent(
-                                trace_utils.CoreEvent.PORT_RUNNING_0,
-                                port_number=1,
-                                master=True,
-                            ),
-                            PortEvent(
-                                trace_utils.CoreEvent.PORT_RUNNING_1,
-                                port_number=2,
-                                master=True,
-                            ),
-                            PortEvent(
-                                trace_utils.CoreEvent.PORT_RUNNING_2,
-                                port_number=5,
-                                master=True,
-                            ),
-                            trace_utils.CoreEvent.INSTR_EVENT_0,
-                            trace_utils.CoreEvent.INSTR_EVENT_1,
-                            trace_utils.CoreEvent.MEMORY_STALL,
-                            trace_utils.CoreEvent.LOCK_STALL,
-                            trace_utils.CoreEvent.INSTR_VECTOR,
-                        ],
+                    trace_utils.configure_packet_tracing_aie2(
+                        tiles_to_trace, shim_tile, trace_size, C_sz_in_bytes
                     )
 
                 # only do 4 tile rows at a time before synchronizing, so we can reuse BDs

From 10c08c8c19fb1ce5499818ea17f550e2aeeded6f Mon Sep 17 00:00:00 2001
From: Jack Lo <jack.lo@amd.com>
Date: Fri, 1 Nov 2024 14:27:32 -0600
Subject: [PATCH 4/6] Format fix

---
 .../basic/matrix_multiplication/single_core/aie2.py    | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index 801573c1f7..e0a02e00f4 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -48,7 +48,15 @@ def main():
     argparser.add_argument("--trace_size", type=int, default=0)
     args = argparser.parse_args()
     my_matmul(
-        args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out, args.trace_size
+        args.M,
+        args.K,
+        args.N,
+        args.m,
+        args.k,
+        args.n,
+        args.dtype_in,
+        args.dtype_out,
+        args.trace_size,
     )
 
 

From a4601ec7f6598e490ccc208f3924dcc47e4e7dbd Mon Sep 17 00:00:00 2001
From: Jack Lo <jack.lo@amd.com>
Date: Sat, 2 Nov 2024 20:03:06 -0600
Subject: [PATCH 5/6] Add trace_size arg

---
 .../matrix_multiplication/cascade/aie2.py     |  4 +-
 .../single_core/aie2_alt.py                   | 55 +++++++------------
 .../matrix_multiplication/whole_array/aie2.py |  4 +-
 3 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/programming_examples/basic/matrix_multiplication/cascade/aie2.py b/programming_examples/basic/matrix_multiplication/cascade/aie2.py
index 8f47f70552..209009e520 100644
--- a/programming_examples/basic/matrix_multiplication/cascade/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/cascade/aie2.py
@@ -41,6 +41,7 @@ def main():
     argparser.add_argument(
         "--dtype_out", type=str, choices=["bf16", "i16", "f32", "i32"], default="i32"
     )
+    argparser.add_argument("--trace_size", type=int, default=0)
     args = argparser.parse_args()
     with mlir_mod_ctx() as ctx:
         my_matmul(
@@ -53,6 +54,7 @@ def main():
             args.n_aie_cols,
             args.dtype_in,
             args.dtype_out,
+            args.trace_size,
         )
         # print(ctx.module.operation.verify())
         print(ctx.module)
@@ -62,7 +64,7 @@ def ceildiv(a, b):
     return (a + b - 1) // b
 
 
-def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str):
+def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str, trace_size):
 
     n_aie_rows = 4
     n_aie_cores = n_aie_rows * n_aie_cols
diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2_alt.py b/programming_examples/basic/matrix_multiplication/single_core/aie2_alt.py
index df69dd0ade..a1ebc0acde 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2_alt.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2_alt.py
@@ -49,9 +49,18 @@ def main():
         choices=["bf16", "i8", "i16", "f32", "i32"],
         default="i32",
     )
+    argparser.add_argument("--trace_size", type=int, default=0)
     args = argparser.parse_args()
     my_matmul(
-        args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out
+        args.M,
+        args.K,
+        args.N,
+        args.m,
+        args.k,
+        args.n,
+        args.dtype_in,
+        args.dtype_out,
+        args.trace_size,
     )
 
 
@@ -59,7 +68,7 @@ def ceildiv(a, b):
     return (a + b - 1) // b
 
 
-def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
+def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str, trace_size):
 
     assert M % m == 0
     assert K % k == 0
@@ -83,8 +92,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
     assert n % t == 0
 
     vectorized = True
-    enable_tracing = False
-    trace_size = 65536
+    enable_tracing = True if trace_size > 0 else False
 
     dtype_in = dtype_map[dtype_in_str]
     dtype_out = dtype_map[dtype_out_str]
@@ -113,7 +121,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
 
     with mlir_mod_ctx() as ctx:
 
-        C_sz_in_bytes = C_sz * np.dtype(dtype_out).itemsize // 8
+        C_sz_in_bytes = C_sz * np.dtype(dtype_out).itemsize
 
         @device(AIEDevice.npu1_1col)
         def device_body():
@@ -199,9 +207,10 @@ def device_body():
             )
             object_fifo_link(memC, outC)
 
-            # Set up a circuit-switched flow from core to shim for tracing information
-            if enable_tracing:
-                flow(compute_tile2, WireBundle.Trace, 0, shim_tile, WireBundle.DMA, 1)
+            # Set up a packet-switched flow from core to shim for tracing information
+            tiles_to_trace = [compute_tile2]
+            if trace_size > 0:
+                trace_utils.configure_packet_tracing_flow(tiles_to_trace, shim_tile)
 
             # Set up compute tiles
 
@@ -233,34 +242,8 @@ def core_body():
             def sequence(A, B, C):
 
                 if enable_tracing:
-                    trace_utils.configure_simple_tracing_aie2(
-                        compute_tile2,
-                        shim_tile,
-                        ddr_id=2,
-                        size=trace_size,
-                        offset=C_sz_in_bytes,
-                        events=[
-                            PortEvent(
-                                trace_utils.CoreEvent.PORT_RUNNING_0,
-                                port_number=1,
-                                master=True,
-                            ),
-                            PortEvent(
-                                trace_utils.CoreEvent.PORT_RUNNING_1,
-                                port_number=2,
-                                master=True,
-                            ),
-                            PortEvent(
-                                trace_utils.CoreEvent.PORT_RUNNING_2,
-                                port_number=5,
-                                master=True,
-                            ),
-                            trace_utils.CoreEvent.INSTR_EVENT_0,
-                            trace_utils.CoreEvent.INSTR_EVENT_1,
-                            trace_utils.CoreEvent.MEMORY_STALL,
-                            trace_utils.CoreEvent.LOCK_STALL,
-                            trace_utils.CoreEvent.INSTR_VECTOR,
-                        ],
+                    trace_utils.configure_packet_tracing_aie2(
+                        tiles_to_trace, shim_tile, trace_size, C_sz_in_bytes
                     )
 
                 # These lists will hold handles to the DMA tasks we configure
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index 7482355288..bc100eea2b 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -46,6 +46,7 @@ def main():
         choices=["bf16", "i8", "i16", "f32", "i32"],
         default="i16",
     )
+    argparser.add_argument("--trace_size", type=int, default=0)
     args = argparser.parse_args()
     with mlir_mod_ctx() as ctx:
         my_matmul(
@@ -59,6 +60,7 @@ def main():
             args.dtype_in,
             args.dtype_out,
             args.b_col_maj,
+            args.trace_size,
         )
         # print(ctx.module.operation.verify())
         print(ctx.module)
@@ -68,7 +70,7 @@ def ceildiv(a, b):
     return (a + b - 1) // b
 
 
-def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str, b_col_maj):
+def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str, b_col_maj, trace_size):
 
     n_aie_rows = 4
     n_aie_cores = n_aie_rows * n_aie_cols

From c394fa8a6e917044f5ae852b58e27747e48205a0 Mon Sep 17 00:00:00 2001
From: Jack Lo <jack.lo@amd.com>
Date: Sat, 2 Nov 2024 20:08:01 -0600
Subject: [PATCH 6/6] format fix

---
 .../basic/matrix_multiplication/whole_array/aie2.py           | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index bc100eea2b..ce3852eb18 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -70,7 +70,9 @@ def ceildiv(a, b):
     return (a + b - 1) // b
 
 
-def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str, b_col_maj, trace_size):
+def my_matmul(
+    M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str, b_col_maj, trace_size
+):
 
     n_aie_rows = 4
     n_aie_cores = n_aie_rows * n_aie_cols