From e5601b2305231ab46bfe7be1255804eabd794e0d Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Sun, 15 Sep 2024 21:55:41 -0500 Subject: [PATCH 1/5] Add performance reference for important matmul kernels --- .../benchmarking_results_ref@709e3fdbf.csv | 14 +++++++++++ .../tools/tune_gemm/database.yaml | 18 ++++++++++++++ .../tools/tune_gemm/matmul_kernel.py | 24 +++++++++++++------ .../perf-kernels/tools/tune_gemm/tune_gemm.py | 8 ++++--- 4 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 python/perf-kernels/tools/tune_gemm/benchmarking_results_ref@709e3fdbf.csv create mode 100644 python/perf-kernels/tools/tune_gemm/database.yaml diff --git a/python/perf-kernels/tools/tune_gemm/benchmarking_results_ref@709e3fdbf.csv b/python/perf-kernels/tools/tune_gemm/benchmarking_results_ref@709e3fdbf.csv new file mode 100644 index 000000000000..dec37e4de94f --- /dev/null +++ b/python/perf-kernels/tools/tune_gemm/benchmarking_results_ref@709e3fdbf.csv @@ -0,0 +1,14 @@ +trans,M,N,K,TFLOPS,us +TN,4864,4096,4096,467.39,349.19 +TN,4864,4096,4160,567.17,292.26 +TN,4864,4096,4224,557.49,301.90 +TN,4864,4096,4288,569.55,299.99 +TN,4864,4096,4097,501.58,325.47 +TN,4864,4096,4098,491.96,331.92 +TN,4864,4096,4100,503.51,324.46 +TN,4864,4096,4104,515.70,317.10 +TN,4864,4096,4112,525.66,311.70 +TN,4864,8192,4096,519.95,627.79 +TN,4864,8192,4160,579.14,572.43 +TN,4864,8192,8192,543.30,1201.6 +TN,4864,8192,8256,563.43,1167.7 diff --git a/python/perf-kernels/tools/tune_gemm/database.yaml b/python/perf-kernels/tools/tune_gemm/database.yaml new file mode 100644 index 000000000000..33c892f3cdde --- /dev/null +++ b/python/perf-kernels/tools/tune_gemm/database.yaml @@ -0,0 +1,18 @@ +# M // BLOCK_M * N // BLOCK_N % 304 == 0 +## 1 workgroup / CU +- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 4096, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 4096, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +## 1 workgroup / CU masked loadK +- {'M': 4864, 'N': 4096, 'K': 4097, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 4096, 'K': 4098, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 4096, 'K': 4100, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 4096, 'K': 4104, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 4096, 'K': 4112, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} + +## 2 workgroups / CU +- {'M': 4864, 'N': 8192, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} diff --git a/python/perf-kernels/tools/tune_gemm/matmul_kernel.py b/python/perf-kernels/tools/tune_gemm/matmul_kernel.py index 1d9902bc2de6..6491340d10d9 100644 --- a/python/perf-kernels/tools/tune_gemm/matmul_kernel.py +++ b/python/perf-kernels/tools/tune_gemm/matmul_kernel.py @@ -46,16 +46,26 @@ def matmul_kernel(a_ptr, b_ptr, c_ptr, bias_ptr, M, N, K, stride_am, stride_ak, bias = tl.load(bias_ptrs, mask=offs_am < M, other=0.0) acc_dtype = tl.float32 if a_ptr.type.element_ty != tl.int8 else tl.int32 accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype) - for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)): - if EVEN_K: - a = tl.load(a_ptrs) - b = tl.load(b_ptrs) - else: - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + + max_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K) - 1 + if EVEN_K: + max_k += 1 + for k in range(0, max_k): + a = tl.load(tl.multiple_of(a_ptrs, (1, 16))) + b = tl.load(tl.multiple_of(b_ptrs, (16, 1))) accumulator += tl.dot(a, b) a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak b_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_bk + + if not EVEN_K: + k = max_k + offs_k = k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + a_ptrsX = a_ptr + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + b_ptrsX = b_ptr + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn + a = tl.load(a_ptrsX, mask=offs_k[None, :] < K, other=0.0) + b = tl.load(b_ptrsX, mask=offs_k[:, None] < K, other=0.0) + accumulator += tl.dot(a, b) + c = accumulator.to(c_ptr.type.element_ty) if BIAS: c += bias[:, None] diff --git a/python/perf-kernels/tools/tune_gemm/tune_gemm.py b/python/perf-kernels/tools/tune_gemm/tune_gemm.py index 291096b3d7af..b3d0d0bccc12 100755 --- a/python/perf-kernels/tools/tune_gemm/tune_gemm.py +++ b/python/perf-kernels/tools/tune_gemm/tune_gemm.py @@ -108,6 +108,7 @@ def prune_configs(M, N, K, configs, elemBytes_a, elemBytes_b): num_warps = config.get("num_warps") num_stages = config.get("num_stages") matrix_instr_nonkdim = config.get("matrix_instr_nonkdim") + EVEN_K = (K % BLOCK_SIZE_K == 0) if matrix_instr_nonkdim > mfma: continue if mfma == 4 and BLOCK_SIZE_K < 64: @@ -149,10 +150,11 @@ def prune_configs(M, N, K, configs, elemBytes_a, elemBytes_b): continue # Skip small block sizes and num_warps for large gemm # For fp16 and f8, we want to only use BLOCK_SIZE >= 64 + # We only want to use a small BLOCK_SIZE_K if not EVEN_K if large_gemm: if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64: continue - if BLOCK_SIZE_K < 64: + if BLOCK_SIZE_K < 64 and EVEN_K: continue if num_warps < 4: continue @@ -657,14 +659,14 @@ def main(): # write best config to tuning_results.yaml if run_bench: - print(f"{formatted_tflops} {minTime}") + print(f"{formatted_tflops} {minTime} {bestConfig_compact_str}") f_results.write(f"{formatted_tflops},{minTime}\n") sizeDict = {'M': M, 'N': N, 'K': K, 'rowMajorA': row_a_str, 'rowMajorB': row_b_str} sizeDict.update(bestConfig) if not run_bench: f_results.write("- " + str(sizeDict) + " ") - f_results.write(f'# TFLOPS: {formatted_tflops} time(us): {minTime}\n') + f_results.write(f'# {bestConfig_compact_str}\n') # remove generated files if asked to if not keepTmp: From 92a8ae4d746d33bfbed57b80e7da13715dd69177 Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Mon, 30 Sep 2024 21:43:36 -0500 Subject: [PATCH 2/5] add config for fp16 --- .../tools/tune_gemm/config_fp16.yaml | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 python/perf-kernels/tools/tune_gemm/config_fp16.yaml diff --git a/python/perf-kernels/tools/tune_gemm/config_fp16.yaml b/python/perf-kernels/tools/tune_gemm/config_fp16.yaml new file mode 100644 index 000000000000..ae9730b842cb --- /dev/null +++ b/python/perf-kernels/tools/tune_gemm/config_fp16.yaml @@ -0,0 +1,24 @@ + + +## 2 workgroups / CU +#- {'M': 4864, 'N': 8192, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +#- {'M': 4864, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 8320, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 8384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +#- {'M': 4864, 'N': 8192, 'K': 12288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 12352, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 12416, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 4864, 'N': 8192, 'K': 12480, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} + +- {'M': 9728, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 9728, 'N': 8192, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 9728, 'N': 8192, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} + + +- {'M': 9728, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 9728, 'N': 8192, 'K': 8320, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} +- {'M': 9728, 'N': 8192, 'K': 8384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2} From 223a5abdad1cde9d749e3ef11775f123ddb18992 Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Tue, 24 Sep 2024 14:49:20 -0500 Subject: [PATCH 3/5] Add a script to process json files --- .../tools/tune_gemm/process_json.py | 234 ++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 python/perf-kernels/tools/tune_gemm/process_json.py diff --git a/python/perf-kernels/tools/tune_gemm/process_json.py b/python/perf-kernels/tools/tune_gemm/process_json.py new file mode 100644 index 000000000000..9a473b9e90f8 --- /dev/null +++ b/python/perf-kernels/tools/tune_gemm/process_json.py @@ -0,0 +1,234 @@ +import numpy as np +from statistics import mean +import argparse +import sys +import json +import os + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="tune a specific gemm size", + allow_abbrev=False, + ) + + parser.add_argument("-d", type=str, default="", help='*_ui dir') + parser.add_argument("-se", type=int, default=0, help="") + parser.add_argument("-sm", type=int, default=0, help="") + parser.add_argument("-sl", type=int, default=0, help="") + parser.add_argument("-wv", type=int, default=-1, help="") + + args = parser.parse_args() + return args + + +def parse_trace(code_fullname, trace_fullname): + instr0_clk, bar1_clk, bar2_clk, bar3_clk, instr9_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt = gen_all_clk( + code_fullname, trace_fullname) + pro, loop, epi, iter_clk = gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk) + bar1_lat, bar2_lat = gen_fine_clk(bar1_clk, bar2_clk, bar3_clk) + + lat1, lat2, lat_sum, idle1, idle2 = print_loop_eff(bar1_lat, bar2_lat, mfma_dsRead_cnt, mfma_dsWrite_cnt) + return pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2 + + +def print_list(myList): + for i in range(len(myList)): + print(myList[i]) + + +def gen_all_clk(code_fullname, trace_fullname): + if not os.path.isfile(trace_fullname): + print(f"trace file not found {trace_fullname}") + return + + marker_to_line = dict() + marker_to_line['firstInstr'] = 2 + marker_barrier = list() + + ## Read code.json to get instruction idx + with open(code_fullname) as code_f: + code_data = json.load(code_f) + code_list = code_data['code'] + + found_1st_barrier = False + mfma_cnt = 0 + should_cnt = False + ## Find the s_barriers + for i in range(len(code_list)): + if "s_barrier" in code_list[i][0]: + marker_barrier.append(code_list[i]) + if not found_1st_barrier: + ## This is barrier1 + found_1st_barrier = True + should_cnt = True + else: + ## This is barrier2 or barrier3 + should_cnt = False + if "mfma" in code_list[i][0] and should_cnt: + mfma_cnt += 1 + + mfma_dsRead_cnt = mfma_cnt + mfma_dsWrite_cnt = 128 - mfma_cnt + + if len(marker_barrier) != 3: + print(f"Not 3 barriers?? Found {len(marker_barrier)}") + exit(0) + marker_to_line['barrier_before_ds_read'] = marker_barrier[0][2] + marker_to_line['instrAfterBarrier1'] = marker_barrier[0][2] + 1 + marker_to_line['barrier_before_ds_write'] = marker_barrier[1][2] + marker_to_line['instrAfterBarrier2'] = marker_barrier[1][2] + 1 + marker_to_line['barrier_after_loop'] = marker_barrier[2][2] + marker_to_line['instrAfterBarrier3'] = marker_barrier[2][2] + 1 + + instrAfterBarrier1_clk = list() + instrAfterBarrier2_clk = list() + instrAfterBarrier3_clk = 0 + firstInstr_clk = 0 + lastInstr_clk = 0 + + ## Read trace to get clk info for the markers + with open(trace_fullname) as trace_f: + trace_data = json.load(trace_f) + trace_list = trace_data['wave']['instructions'] + + for i in range(len(trace_list)): + ## Capture the clk for the first instruction in the kernel + if trace_list[i][-1] == marker_to_line['firstInstr']: + firstInstr_clk = trace_list[i][0] + ## Capture barrier1 + if trace_list[i][-1] == marker_to_line['instrAfterBarrier1']: + instrAfterBarrier1_clk.append(trace_list[i][0]) + ## Capture barrier2 + if trace_list[i][-1] == marker_to_line['instrAfterBarrier2']: + instrAfterBarrier2_clk.append(trace_list[i][0]) + ## Capture barrier3 + if trace_list[i][-1] == marker_to_line['instrAfterBarrier3']: + instrAfterBarrier3_clk = trace_list[i][0] + lastInstr_clk = trace_list[-1][0] + + if len(instrAfterBarrier1_clk) != len(instrAfterBarrier2_clk): + print("different length of instrAfterBarrier1_clk and instrAfterBarrier2_clk") + exit(0) + + return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt + + +def gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk): + prologue = bar1_clk[0] - instr0_clk + loop = bar3_clk - bar1_clk[0] + epilogue = instr9_clk - bar3_clk + clk_per_iter = loop / len(bar1_clk) + return prologue, loop, epilogue, clk_per_iter + + +def gen_max_wid(code_fullname): + code_f = open(code_fullname) + code_data = json.load(code_f) + num_wv = code_data['code'][2][-2] + return int(num_wv / 8) + + +def gen_fine_clk(bar1_clk, bar2_clk, bar3_clk): + bar1_lat = list() + bar2_lat = list() + for i in range(len(bar1_clk)): + bar1_lat.append(bar2_clk[i] - bar1_clk[i]) + if i + 1 == len(bar1_clk): + bar2_lat.append(bar3_clk - bar2_clk[i]) + else: + bar2_lat.append(bar1_clk[i + 1] - bar2_clk[i]) + + return bar1_lat, bar2_lat + + +def list_to_stat(myList): + ave = mean(myList) + maxVal = max(myList) + minVal = min(myList) + stdVal = np.std(myList) + return int(ave), maxVal, minVal, stdVal + + +def print_loop_eff(list1, list2, cnt1, cnt2): + if len(list1) != len(list2): + print("lists do not have the same length!!") + exit(0) + + ave1, max1, min1, stddev1 = list_to_stat(list1) + ave2, max2, min2, stddev2 = list_to_stat(list2) + + return ave1, ave2, ave1 + ave2, ave1 - cnt1 * 2 * 16, ave2 - cnt2 * 2 * 16 + + +def calc_global_store_cycles(code_fullname): + f = open(code_fullname) + data = json.load(f) + idx = 0 + saw_store = False + total_mem = 0 + total_hitcnt = 0 + vmcnt_cnt = 0 + total_iss = 0 + total_iss_cnt = 0 + for i in data["code"]: + if "global_store" in i[0]: + global_store_name = i[0].split()[0] + total_iss += i[-1] / i[-2] + total_iss_cnt += i[-2] + saw_store = True + idx += 1 + if saw_store and "vmcnt(0)" in i[0]: + hitcnt = i[-2] + total_mem += i[-1] / i[-2] + total_hitcnt += hitcnt + vmcnt_cnt += 1 + + print(f"{idx} {global_store_name} {vmcnt_cnt} vmcnt(0)") + print(f"total cycles: {total_iss+total_mem:.0f} = {total_iss:.0f}(iss) + {total_mem:.0f}(mem)") + print(f"{total_iss/idx:.1f} issue cycles per {global_store_name}") + print(f"{total_mem/vmcnt_cnt:.1f} cycles per vmcnt(0)") + + +def main(): + args = parse_args() + trace_dir = args.d + se = args.se + sm = args.sm + sl = args.sl + wv = args.wv + + code_filename = "code.json" + code_fullname = os.path.join(trace_dir, code_filename) + trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wv}.json" + trace_fullname = os.path.join(trace_dir, trace_filename) + maxwid = gen_max_wid(code_fullname) + + print("wid,prologue,loop,epilogue,iter_clk,lat1,lat2,iter_lat,idle1,idle2") + epi_total = 0 + epi_1st = 0 + flag = False + cnt = 0 + for wid in range(maxwid): + if wv != -1 and wid != wv: + continue + trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wid}.json" + trace_fullname = os.path.join(trace_dir, trace_filename) + pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2 = parse_trace(code_fullname, trace_fullname) + print(f"{wid},{pro},{loop},{epi},{iter_clk:.0f},{lat1},{lat2},{lat_sum},{idle1},{idle2}") + if not flag: + epi_1st = epi + flag = False + + if epi > 2 * epi_1st: + continue + epi_total += epi + cnt += 1 + + print(f"averaged epilogue cycles: {epi_total / cnt:.0f}") + print(f"global_store info (averaged for all {maxwid} waves):") + calc_global_store_cycles(code_fullname) + + +if __name__ == '__main__': + sys.exit(main()) From 995144fe597e6f433309da589084aad93bec66e4 Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Mon, 30 Sep 2024 20:50:48 -0500 Subject: [PATCH 4/5] Recover/detect abnormal traces --- .../tools/tune_gemm/process_json.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/python/perf-kernels/tools/tune_gemm/process_json.py b/python/perf-kernels/tools/tune_gemm/process_json.py index 9a473b9e90f8..c762da69ac79 100644 --- a/python/perf-kernels/tools/tune_gemm/process_json.py +++ b/python/perf-kernels/tools/tune_gemm/process_json.py @@ -23,13 +23,15 @@ def parse_args(): def parse_trace(code_fullname, trace_fullname): - instr0_clk, bar1_clk, bar2_clk, bar3_clk, instr9_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt = gen_all_clk( + instr0_clk, bar1_clk, bar2_clk, bar3_clk, instr9_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt, incomplete = gen_all_clk( code_fullname, trace_fullname) + if incomplete: + return 0, 0, 0, 0, 0, 0, 0, 0, 0, incomplete pro, loop, epi, iter_clk = gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk) bar1_lat, bar2_lat = gen_fine_clk(bar1_clk, bar2_clk, bar3_clk) lat1, lat2, lat_sum, idle1, idle2 = print_loop_eff(bar1_lat, bar2_lat, mfma_dsRead_cnt, mfma_dsWrite_cnt) - return pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2 + return pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2, incomplete def print_list(myList): @@ -107,11 +109,19 @@ def gen_all_clk(code_fullname, trace_fullname): instrAfterBarrier3_clk = trace_list[i][0] lastInstr_clk = trace_list[-1][0] + incomplete = False if len(instrAfterBarrier1_clk) != len(instrAfterBarrier2_clk): print("different length of instrAfterBarrier1_clk and instrAfterBarrier2_clk") - exit(0) + incomplete = True + + len1 = len(instrAfterBarrier1_clk) + len2 = len(instrAfterBarrier2_clk) + len3 = instrAfterBarrier3_clk + + if len1 == 0 or len2 == 0 or len3 == 0: + incomplete = True - return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt + return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt, incomplete def gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk): @@ -207,6 +217,7 @@ def main(): print("wid,prologue,loop,epilogue,iter_clk,lat1,lat2,iter_lat,idle1,idle2") epi_total = 0 epi_1st = 0 + total = 0 flag = False cnt = 0 for wid in range(maxwid): @@ -214,7 +225,9 @@ def main(): continue trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wid}.json" trace_fullname = os.path.join(trace_dir, trace_filename) - pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2 = parse_trace(code_fullname, trace_fullname) + pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2, incomplete = parse_trace(code_fullname, trace_fullname) + if incomplete: + continue print(f"{wid},{pro},{loop},{epi},{iter_clk:.0f},{lat1},{lat2},{lat_sum},{idle1},{idle2}") if not flag: epi_1st = epi @@ -223,9 +236,13 @@ def main(): if epi > 2 * epi_1st: continue epi_total += epi + total += epi + pro + loop cnt += 1 + if cnt == 0: + exit(0) print(f"averaged epilogue cycles: {epi_total / cnt:.0f}") + print(f"averaged total cycles: {total / cnt:.0f}") print(f"global_store info (averaged for all {maxwid} waves):") calc_global_store_cycles(code_fullname) From d47adcede88ab79d590d0fdad27672137fd883ee Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Fri, 4 Oct 2024 11:18:51 -0500 Subject: [PATCH 5/5] Fix mfma cnt --- .../tools/tune_gemm/process_json.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/python/perf-kernels/tools/tune_gemm/process_json.py b/python/perf-kernels/tools/tune_gemm/process_json.py index c762da69ac79..aa2d478d7902 100644 --- a/python/perf-kernels/tools/tune_gemm/process_json.py +++ b/python/perf-kernels/tools/tune_gemm/process_json.py @@ -54,7 +54,8 @@ def gen_all_clk(code_fullname, trace_fullname): code_list = code_data['code'] found_1st_barrier = False - mfma_cnt = 0 + mfma_dsRead_cnt = 0 + mfma_cnt_total = 0 should_cnt = False ## Find the s_barriers for i in range(len(code_list)): @@ -68,10 +69,14 @@ def gen_all_clk(code_fullname, trace_fullname): ## This is barrier2 or barrier3 should_cnt = False if "mfma" in code_list[i][0] and should_cnt: - mfma_cnt += 1 + mfma_dsRead_cnt += 1 + if "mfma" in code_list[i][0]: + mfma_cnt_total += 1 - mfma_dsRead_cnt = mfma_cnt - mfma_dsWrite_cnt = 128 - mfma_cnt + ## /= 2 because the last iteration of local_load and tt.dot + ## is peeled off by stream-pipeliner + mfma_cnt_total /= 2 + mfma_dsWrite_cnt = mfma_cnt_total - mfma_dsRead_cnt if len(marker_barrier) != 3: print(f"Not 3 barriers?? Found {len(marker_barrier)}") @@ -121,7 +126,16 @@ def gen_all_clk(code_fullname, trace_fullname): if len1 == 0 or len2 == 0 or len3 == 0: incomplete = True - return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt, incomplete + #print(f"{firstInstr_clk}") + #print(f"{instrAfterBarrier1_clk}") + #print(f"{instrAfterBarrier2_clk}") + #print(f"{instrAfterBarrier3_clk}") + #print(f"{lastInstr_clk}") + #print(f"{mfma_dsRead_cnt}") + #print(f"{mfma_dsWrite_cnt}") + #print(f"{incomplete}") + + return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, int(mfma_dsWrite_cnt), incomplete def gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk): @@ -225,6 +239,9 @@ def main(): continue trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wid}.json" trace_fullname = os.path.join(trace_dir, trace_filename) + if not os.path.isfile(trace_fullname): + #print(f"trace file not found {trace_fullname}") + return pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2, incomplete = parse_trace(code_fullname, trace_fullname) if incomplete: continue