From e5601b2305231ab46bfe7be1255804eabd794e0d Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Sun, 15 Sep 2024 21:55:41 -0500
Subject: [PATCH 1/5] Add performance reference for important matmul kernels

---
 .../benchmarking_results_ref@709e3fdbf.csv    | 14 +++++++++++
 .../tools/tune_gemm/database.yaml             | 18 ++++++++++++++
 .../tools/tune_gemm/matmul_kernel.py          | 24 +++++++++++++------
 .../perf-kernels/tools/tune_gemm/tune_gemm.py |  8 ++++---
 4 files changed, 54 insertions(+), 10 deletions(-)
 create mode 100644 python/perf-kernels/tools/tune_gemm/benchmarking_results_ref@709e3fdbf.csv
 create mode 100644 python/perf-kernels/tools/tune_gemm/database.yaml

diff --git a/python/perf-kernels/tools/tune_gemm/benchmarking_results_ref@709e3fdbf.csv b/python/perf-kernels/tools/tune_gemm/benchmarking_results_ref@709e3fdbf.csv
new file mode 100644
index 000000000000..dec37e4de94f
--- /dev/null
+++ b/python/perf-kernels/tools/tune_gemm/benchmarking_results_ref@709e3fdbf.csv
@@ -0,0 +1,14 @@
+trans,M,N,K,TFLOPS,us
+TN,4864,4096,4096,467.39,349.19
+TN,4864,4096,4160,567.17,292.26
+TN,4864,4096,4224,557.49,301.90
+TN,4864,4096,4288,569.55,299.99
+TN,4864,4096,4097,501.58,325.47
+TN,4864,4096,4098,491.96,331.92
+TN,4864,4096,4100,503.51,324.46
+TN,4864,4096,4104,515.70,317.10
+TN,4864,4096,4112,525.66,311.70
+TN,4864,8192,4096,519.95,627.79
+TN,4864,8192,4160,579.14,572.43
+TN,4864,8192,8192,543.30,1201.6
+TN,4864,8192,8256,563.43,1167.7
diff --git a/python/perf-kernels/tools/tune_gemm/database.yaml b/python/perf-kernels/tools/tune_gemm/database.yaml
new file mode 100644
index 000000000000..33c892f3cdde
--- /dev/null
+++ b/python/perf-kernels/tools/tune_gemm/database.yaml
@@ -0,0 +1,18 @@
+# M // BLOCK_M * N // BLOCK_N % 304 == 0
+## 1 workgroup / CU
+- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 4096, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 4096, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+## 1 workgroup / CU masked loadK
+- {'M': 4864, 'N': 4096, 'K': 4097, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 4096, 'K': 4098, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 4096, 'K': 4100, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 4096, 'K': 4104, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 4096, 'K': 4112, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+
+## 2 workgroups / CU
+- {'M': 4864, 'N': 8192, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
diff --git a/python/perf-kernels/tools/tune_gemm/matmul_kernel.py b/python/perf-kernels/tools/tune_gemm/matmul_kernel.py
index 1d9902bc2de6..6491340d10d9 100644
--- a/python/perf-kernels/tools/tune_gemm/matmul_kernel.py
+++ b/python/perf-kernels/tools/tune_gemm/matmul_kernel.py
@@ -46,16 +46,26 @@ def matmul_kernel(a_ptr, b_ptr, c_ptr, bias_ptr, M, N, K, stride_am, stride_ak,
         bias = tl.load(bias_ptrs, mask=offs_am < M, other=0.0)
     acc_dtype = tl.float32 if a_ptr.type.element_ty != tl.int8 else tl.int32
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
-        if EVEN_K:
-            a = tl.load(a_ptrs)
-            b = tl.load(b_ptrs)
-        else:
-            a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
-            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+
+    max_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K) - 1
+    if EVEN_K:
+        max_k += 1
+    for k in range(0, max_k):
+        a = tl.load(tl.multiple_of(a_ptrs, (1, 16)))
+        b = tl.load(tl.multiple_of(b_ptrs, (16, 1)))
         accumulator += tl.dot(a, b)
         a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_bk
+
+    if not EVEN_K:
+        k = max_k
+        offs_k = k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+        a_ptrsX = a_ptr + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
+        b_ptrsX = b_ptr + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
+        a = tl.load(a_ptrsX, mask=offs_k[None, :] < K, other=0.0)
+        b = tl.load(b_ptrsX, mask=offs_k[:, None] < K, other=0.0)
+        accumulator += tl.dot(a, b)
+
     c = accumulator.to(c_ptr.type.element_ty)
     if BIAS:
         c += bias[:, None]
diff --git a/python/perf-kernels/tools/tune_gemm/tune_gemm.py b/python/perf-kernels/tools/tune_gemm/tune_gemm.py
index 291096b3d7af..b3d0d0bccc12 100755
--- a/python/perf-kernels/tools/tune_gemm/tune_gemm.py
+++ b/python/perf-kernels/tools/tune_gemm/tune_gemm.py
@@ -108,6 +108,7 @@ def prune_configs(M, N, K, configs, elemBytes_a, elemBytes_b):
         num_warps = config.get("num_warps")
         num_stages = config.get("num_stages")
         matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+        EVEN_K = (K % BLOCK_SIZE_K == 0)
         if matrix_instr_nonkdim > mfma:
             continue
         if mfma == 4 and BLOCK_SIZE_K < 64:
@@ -149,10 +150,11 @@ def prune_configs(M, N, K, configs, elemBytes_a, elemBytes_b):
             continue
         # Skip small block sizes and num_warps for large gemm
         # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
+        # We only want to use a small BLOCK_SIZE_K if not EVEN_K
         if large_gemm:
             if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
                 continue
-            if BLOCK_SIZE_K < 64:
+            if BLOCK_SIZE_K < 64 and EVEN_K:
                 continue
             if num_warps < 4:
                 continue
@@ -657,14 +659,14 @@ def main():
 
         # write best config to tuning_results.yaml
         if run_bench:
-            print(f"{formatted_tflops}     {minTime}")
+            print(f"{formatted_tflops}     {minTime}   {bestConfig_compact_str}")
             f_results.write(f"{formatted_tflops},{minTime}\n")
 
         sizeDict = {'M': M, 'N': N, 'K': K, 'rowMajorA': row_a_str, 'rowMajorB': row_b_str}
         sizeDict.update(bestConfig)
         if not run_bench:
             f_results.write("- " + str(sizeDict) + " ")
-            f_results.write(f'# TFLOPS: {formatted_tflops} time(us): {minTime}\n')
+            f_results.write(f'# {bestConfig_compact_str}\n')
 
         # remove generated files if asked to
         if not keepTmp:

From 92a8ae4d746d33bfbed57b80e7da13715dd69177 Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Mon, 30 Sep 2024 21:43:36 -0500
Subject: [PATCH 2/5] add config for fp16

---
 .../tools/tune_gemm/config_fp16.yaml          | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 python/perf-kernels/tools/tune_gemm/config_fp16.yaml

diff --git a/python/perf-kernels/tools/tune_gemm/config_fp16.yaml b/python/perf-kernels/tools/tune_gemm/config_fp16.yaml
new file mode 100644
index 000000000000..ae9730b842cb
--- /dev/null
+++ b/python/perf-kernels/tools/tune_gemm/config_fp16.yaml
@@ -0,0 +1,24 @@
+
+
+## 2 workgroups / CU
+#- {'M': 4864, 'N': 8192, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+#- {'M': 4864, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 8320, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 8384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+#- {'M': 4864, 'N': 8192, 'K': 12288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 12352, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 12416, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 4864, 'N': 8192, 'K': 12480, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+
+- {'M': 9728, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 9728, 'N': 8192, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 9728, 'N': 8192, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+
+
+- {'M': 9728, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 9728, 'N': 8192, 'K': 8320, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
+- {'M': 9728, 'N': 8192, 'K': 8384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}

From 223a5abdad1cde9d749e3ef11775f123ddb18992 Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Tue, 24 Sep 2024 14:49:20 -0500
Subject: [PATCH 3/5] Add a script to process json files

---
 .../tools/tune_gemm/process_json.py           | 234 ++++++++++++++++++
 1 file changed, 234 insertions(+)
 create mode 100644 python/perf-kernels/tools/tune_gemm/process_json.py

diff --git a/python/perf-kernels/tools/tune_gemm/process_json.py b/python/perf-kernels/tools/tune_gemm/process_json.py
new file mode 100644
index 000000000000..9a473b9e90f8
--- /dev/null
+++ b/python/perf-kernels/tools/tune_gemm/process_json.py
@@ -0,0 +1,234 @@
+import numpy as np
+from statistics import mean
+import argparse
+import sys
+import json
+import os
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog="tune a specific gemm size",
+        allow_abbrev=False,
+    )
+
+    parser.add_argument("-d", type=str, default="", help='*_ui dir')
+    parser.add_argument("-se", type=int, default=0, help="")
+    parser.add_argument("-sm", type=int, default=0, help="")
+    parser.add_argument("-sl", type=int, default=0, help="")
+    parser.add_argument("-wv", type=int, default=-1, help="")
+
+    args = parser.parse_args()
+    return args
+
+
+def parse_trace(code_fullname, trace_fullname):
+    instr0_clk, bar1_clk, bar2_clk, bar3_clk, instr9_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt = gen_all_clk(
+        code_fullname, trace_fullname)
+    pro, loop, epi, iter_clk = gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk)
+    bar1_lat, bar2_lat = gen_fine_clk(bar1_clk, bar2_clk, bar3_clk)
+
+    lat1, lat2, lat_sum, idle1, idle2 = print_loop_eff(bar1_lat, bar2_lat, mfma_dsRead_cnt, mfma_dsWrite_cnt)
+    return pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2
+
+
+def print_list(myList):
+    for i in range(len(myList)):
+        print(myList[i])
+
+
+def gen_all_clk(code_fullname, trace_fullname):
+    if not os.path.isfile(trace_fullname):
+        print(f"trace file not found {trace_fullname}")
+        return
+
+    marker_to_line = dict()
+    marker_to_line['firstInstr'] = 2
+    marker_barrier = list()
+
+    ## Read code.json to get instruction idx
+    with open(code_fullname) as code_f:
+        code_data = json.load(code_f)
+        code_list = code_data['code']
+
+        found_1st_barrier = False
+        mfma_cnt = 0
+        should_cnt = False
+        ## Find the s_barriers
+        for i in range(len(code_list)):
+            if "s_barrier" in code_list[i][0]:
+                marker_barrier.append(code_list[i])
+                if not found_1st_barrier:
+                    ## This is barrier1
+                    found_1st_barrier = True
+                    should_cnt = True
+                else:
+                    ## This is barrier2 or barrier3
+                    should_cnt = False
+            if "mfma" in code_list[i][0] and should_cnt:
+                mfma_cnt += 1
+
+    mfma_dsRead_cnt = mfma_cnt
+    mfma_dsWrite_cnt = 128 - mfma_cnt
+
+    if len(marker_barrier) != 3:
+        print(f"Not 3 barriers?? Found {len(marker_barrier)}")
+        exit(0)
+    marker_to_line['barrier_before_ds_read'] = marker_barrier[0][2]
+    marker_to_line['instrAfterBarrier1'] = marker_barrier[0][2] + 1
+    marker_to_line['barrier_before_ds_write'] = marker_barrier[1][2]
+    marker_to_line['instrAfterBarrier2'] = marker_barrier[1][2] + 1
+    marker_to_line['barrier_after_loop'] = marker_barrier[2][2]
+    marker_to_line['instrAfterBarrier3'] = marker_barrier[2][2] + 1
+
+    instrAfterBarrier1_clk = list()
+    instrAfterBarrier2_clk = list()
+    instrAfterBarrier3_clk = 0
+    firstInstr_clk = 0
+    lastInstr_clk = 0
+
+    ## Read trace to get clk info for the markers
+    with open(trace_fullname) as trace_f:
+        trace_data = json.load(trace_f)
+        trace_list = trace_data['wave']['instructions']
+
+        for i in range(len(trace_list)):
+            ## Capture the clk for the first instruction in the kernel
+            if trace_list[i][-1] == marker_to_line['firstInstr']:
+                firstInstr_clk = trace_list[i][0]
+            ## Capture barrier1
+            if trace_list[i][-1] == marker_to_line['instrAfterBarrier1']:
+                instrAfterBarrier1_clk.append(trace_list[i][0])
+            ## Capture barrier2
+            if trace_list[i][-1] == marker_to_line['instrAfterBarrier2']:
+                instrAfterBarrier2_clk.append(trace_list[i][0])
+            ## Capture barrier3
+            if trace_list[i][-1] == marker_to_line['instrAfterBarrier3']:
+                instrAfterBarrier3_clk = trace_list[i][0]
+        lastInstr_clk = trace_list[-1][0]
+
+    if len(instrAfterBarrier1_clk) != len(instrAfterBarrier2_clk):
+        print("different length of instrAfterBarrier1_clk and instrAfterBarrier2_clk")
+        exit(0)
+
+    return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt
+
+
+def gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk):
+    prologue = bar1_clk[0] - instr0_clk
+    loop = bar3_clk - bar1_clk[0]
+    epilogue = instr9_clk - bar3_clk
+    clk_per_iter = loop / len(bar1_clk)
+    return prologue, loop, epilogue, clk_per_iter
+
+
+def gen_max_wid(code_fullname):
+    code_f = open(code_fullname)
+    code_data = json.load(code_f)
+    num_wv = code_data['code'][2][-2]
+    return int(num_wv / 8)
+
+
+def gen_fine_clk(bar1_clk, bar2_clk, bar3_clk):
+    bar1_lat = list()
+    bar2_lat = list()
+    for i in range(len(bar1_clk)):
+        bar1_lat.append(bar2_clk[i] - bar1_clk[i])
+        if i + 1 == len(bar1_clk):
+            bar2_lat.append(bar3_clk - bar2_clk[i])
+        else:
+            bar2_lat.append(bar1_clk[i + 1] - bar2_clk[i])
+
+    return bar1_lat, bar2_lat
+
+
+def list_to_stat(myList):
+    ave = mean(myList)
+    maxVal = max(myList)
+    minVal = min(myList)
+    stdVal = np.std(myList)
+    return int(ave), maxVal, minVal, stdVal
+
+
+def print_loop_eff(list1, list2, cnt1, cnt2):
+    if len(list1) != len(list2):
+        print("lists do not have the same length!!")
+        exit(0)
+
+    ave1, max1, min1, stddev1 = list_to_stat(list1)
+    ave2, max2, min2, stddev2 = list_to_stat(list2)
+
+    return ave1, ave2, ave1 + ave2, ave1 - cnt1 * 2 * 16, ave2 - cnt2 * 2 * 16
+
+
+def calc_global_store_cycles(code_fullname):
+    f = open(code_fullname)
+    data = json.load(f)
+    idx = 0
+    saw_store = False
+    total_mem = 0
+    total_hitcnt = 0
+    vmcnt_cnt = 0
+    total_iss = 0
+    total_iss_cnt = 0
+    for i in data["code"]:
+        if "global_store" in i[0]:
+            global_store_name = i[0].split()[0]
+            total_iss += i[-1] / i[-2]
+            total_iss_cnt += i[-2]
+            saw_store = True
+            idx += 1
+        if saw_store and "vmcnt(0)" in i[0]:
+            hitcnt = i[-2]
+            total_mem += i[-1] / i[-2]
+            total_hitcnt += hitcnt
+            vmcnt_cnt += 1
+
+    print(f"{idx} {global_store_name}  {vmcnt_cnt} vmcnt(0)")
+    print(f"total cycles: {total_iss+total_mem:.0f} = {total_iss:.0f}(iss) + {total_mem:.0f}(mem)")
+    print(f"{total_iss/idx:.1f} issue cycles per {global_store_name}")
+    print(f"{total_mem/vmcnt_cnt:.1f} cycles per vmcnt(0)")
+
+
+def main():
+    args = parse_args()
+    trace_dir = args.d
+    se = args.se
+    sm = args.sm
+    sl = args.sl
+    wv = args.wv
+
+    code_filename = "code.json"
+    code_fullname = os.path.join(trace_dir, code_filename)
+    trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wv}.json"
+    trace_fullname = os.path.join(trace_dir, trace_filename)
+    maxwid = gen_max_wid(code_fullname)
+
+    print("wid,prologue,loop,epilogue,iter_clk,lat1,lat2,iter_lat,idle1,idle2")
+    epi_total = 0
+    epi_1st = 0
+    flag = False
+    cnt = 0
+    for wid in range(maxwid):
+        if wv != -1 and wid != wv:
+            continue
+        trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wid}.json"
+        trace_fullname = os.path.join(trace_dir, trace_filename)
+        pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2 = parse_trace(code_fullname, trace_fullname)
+        print(f"{wid},{pro},{loop},{epi},{iter_clk:.0f},{lat1},{lat2},{lat_sum},{idle1},{idle2}")
+        if not flag:
+            epi_1st = epi
+            flag = False
+
+        if epi > 2 * epi_1st:
+            continue
+        epi_total += epi
+        cnt += 1
+
+    print(f"averaged epilogue cycles: {epi_total / cnt:.0f}")
+    print(f"global_store info (averaged for all {maxwid} waves):")
+    calc_global_store_cycles(code_fullname)
+
+
+if __name__ == '__main__':
+    sys.exit(main())

From 995144fe597e6f433309da589084aad93bec66e4 Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Mon, 30 Sep 2024 20:50:48 -0500
Subject: [PATCH 4/5] Recover/detect abnormal traces

---
 .../tools/tune_gemm/process_json.py           | 27 +++++++++++++++----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/python/perf-kernels/tools/tune_gemm/process_json.py b/python/perf-kernels/tools/tune_gemm/process_json.py
index 9a473b9e90f8..c762da69ac79 100644
--- a/python/perf-kernels/tools/tune_gemm/process_json.py
+++ b/python/perf-kernels/tools/tune_gemm/process_json.py
@@ -23,13 +23,15 @@ def parse_args():
 
 
 def parse_trace(code_fullname, trace_fullname):
-    instr0_clk, bar1_clk, bar2_clk, bar3_clk, instr9_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt = gen_all_clk(
+    instr0_clk, bar1_clk, bar2_clk, bar3_clk, instr9_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt, incomplete = gen_all_clk(
         code_fullname, trace_fullname)
+    if incomplete:
+        return 0, 0, 0, 0, 0, 0, 0, 0, 0, incomplete
     pro, loop, epi, iter_clk = gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk)
     bar1_lat, bar2_lat = gen_fine_clk(bar1_clk, bar2_clk, bar3_clk)
 
     lat1, lat2, lat_sum, idle1, idle2 = print_loop_eff(bar1_lat, bar2_lat, mfma_dsRead_cnt, mfma_dsWrite_cnt)
-    return pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2
+    return pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2, incomplete
 
 
 def print_list(myList):
@@ -107,11 +109,19 @@ def gen_all_clk(code_fullname, trace_fullname):
                 instrAfterBarrier3_clk = trace_list[i][0]
         lastInstr_clk = trace_list[-1][0]
 
+    incomplete = False
     if len(instrAfterBarrier1_clk) != len(instrAfterBarrier2_clk):
         print("different length of instrAfterBarrier1_clk and instrAfterBarrier2_clk")
-        exit(0)
+        incomplete = True
+
+    len1 = len(instrAfterBarrier1_clk)
+    len2 = len(instrAfterBarrier2_clk)
+    len3 = instrAfterBarrier3_clk
+
+    if len1 == 0 or len2 == 0 or len3 == 0:
+        incomplete = True
 
-    return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt
+    return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt, incomplete
 
 
 def gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk):
@@ -207,6 +217,7 @@ def main():
     print("wid,prologue,loop,epilogue,iter_clk,lat1,lat2,iter_lat,idle1,idle2")
     epi_total = 0
     epi_1st = 0
+    total = 0
     flag = False
     cnt = 0
     for wid in range(maxwid):
@@ -214,7 +225,9 @@ def main():
             continue
         trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wid}.json"
         trace_fullname = os.path.join(trace_dir, trace_filename)
-        pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2 = parse_trace(code_fullname, trace_fullname)
+        pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2, incomplete = parse_trace(code_fullname, trace_fullname)
+        if incomplete:
+            continue
         print(f"{wid},{pro},{loop},{epi},{iter_clk:.0f},{lat1},{lat2},{lat_sum},{idle1},{idle2}")
         if not flag:
             epi_1st = epi
@@ -223,9 +236,13 @@ def main():
         if epi > 2 * epi_1st:
             continue
         epi_total += epi
+        total += epi + pro + loop
         cnt += 1
 
+    if cnt == 0:
+        exit(0)
     print(f"averaged epilogue cycles: {epi_total / cnt:.0f}")
+    print(f"averaged total cycles: {total / cnt:.0f}")
     print(f"global_store info (averaged for all {maxwid} waves):")
     calc_global_store_cycles(code_fullname)
 

From d47adcede88ab79d590d0fdad27672137fd883ee Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Fri, 4 Oct 2024 11:18:51 -0500
Subject: [PATCH 5/5] Fix mfma cnt

---
 .../tools/tune_gemm/process_json.py           | 27 +++++++++++++++----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/python/perf-kernels/tools/tune_gemm/process_json.py b/python/perf-kernels/tools/tune_gemm/process_json.py
index c762da69ac79..aa2d478d7902 100644
--- a/python/perf-kernels/tools/tune_gemm/process_json.py
+++ b/python/perf-kernels/tools/tune_gemm/process_json.py
@@ -54,7 +54,8 @@ def gen_all_clk(code_fullname, trace_fullname):
         code_list = code_data['code']
 
         found_1st_barrier = False
-        mfma_cnt = 0
+        mfma_dsRead_cnt = 0
+        mfma_cnt_total = 0
         should_cnt = False
         ## Find the s_barriers
         for i in range(len(code_list)):
@@ -68,10 +69,14 @@ def gen_all_clk(code_fullname, trace_fullname):
                     ## This is barrier2 or barrier3
                     should_cnt = False
             if "mfma" in code_list[i][0] and should_cnt:
-                mfma_cnt += 1
+                mfma_dsRead_cnt += 1
+            if "mfma" in code_list[i][0]:
+                mfma_cnt_total += 1
 
-    mfma_dsRead_cnt = mfma_cnt
-    mfma_dsWrite_cnt = 128 - mfma_cnt
+    ## /= 2 because the last iteration of local_load and tt.dot
+    ## is peeled off by stream-pipeliner
+    mfma_cnt_total /= 2
+    mfma_dsWrite_cnt = mfma_cnt_total - mfma_dsRead_cnt
 
     if len(marker_barrier) != 3:
         print(f"Not 3 barriers?? Found {len(marker_barrier)}")
@@ -121,7 +126,16 @@ def gen_all_clk(code_fullname, trace_fullname):
     if len1 == 0 or len2 == 0 or len3 == 0:
         incomplete = True
 
-    return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt, incomplete
+    #print(f"{firstInstr_clk}")
+    #print(f"{instrAfterBarrier1_clk}")
+    #print(f"{instrAfterBarrier2_clk}")
+    #print(f"{instrAfterBarrier3_clk}")
+    #print(f"{lastInstr_clk}")
+    #print(f"{mfma_dsRead_cnt}")
+    #print(f"{mfma_dsWrite_cnt}")
+    #print(f"{incomplete}")
+
+    return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, int(mfma_dsWrite_cnt), incomplete
 
 
 def gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk):
@@ -225,6 +239,9 @@ def main():
             continue
         trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wid}.json"
         trace_fullname = os.path.join(trace_dir, trace_filename)
+        if not os.path.isfile(trace_fullname):
+            #print(f"trace file not found {trace_fullname}")
+            return
         pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2, incomplete = parse_trace(code_fullname, trace_fullname)
         if incomplete:
             continue