Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add performance reference for important matmul kernels #642

Open
wants to merge 5 commits into
base: main_perf
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions python/perf-kernels/tools/tune_gemm/[email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
trans,M,N,K,TFLOPS,us
TN,4864,4096,4096,467.39,349.19
TN,4864,4096,4160,567.17,292.26
TN,4864,4096,4224,557.49,301.90
TN,4864,4096,4288,569.55,299.99
TN,4864,4096,4097,501.58,325.47
TN,4864,4096,4098,491.96,331.92
TN,4864,4096,4100,503.51,324.46
TN,4864,4096,4104,515.70,317.10
TN,4864,4096,4112,525.66,311.70
TN,4864,8192,4096,519.95,627.79
TN,4864,8192,4160,579.14,572.43
TN,4864,8192,8192,543.30,1201.6
TN,4864,8192,8256,563.43,1167.7
24 changes: 24 additions & 0 deletions python/perf-kernels/tools/tune_gemm/config_fp16.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@


## 2 workgroups / CU
#- {'M': 4864, 'N': 8192, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
#- {'M': 4864, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 8320, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 8384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
#- {'M': 4864, 'N': 8192, 'K': 12288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 12352, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 12416, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 12480, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}

- {'M': 9728, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 9728, 'N': 8192, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 9728, 'N': 8192, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}


- {'M': 9728, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 9728, 'N': 8192, 'K': 8320, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 9728, 'N': 8192, 'K': 8384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
18 changes: 18 additions & 0 deletions python/perf-kernels/tools/tune_gemm/database.yaml
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what are the purposes of this database file ? are they used to benchmark against the ref.csv ? if that's the case, what if the parameters changed, e.g GROUP_SIZE_M change from 4 to 8 ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we may need other database file format if this is a daily/per commit tasks ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This database.yaml is used as the current best perf config. If you have an optimization that can improve the best perf number and requires a different config, we should update the database.
The main purpose is to catch regression.

Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# M // BLOCK_M * N // BLOCK_N % 304 == 0
## 1 workgroup / CU
- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 4096, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 4096, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
## 1 workgroup / CU masked loadK
- {'M': 4864, 'N': 4096, 'K': 4097, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 4096, 'K': 4098, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 4096, 'K': 4100, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 4096, 'K': 4104, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 4096, 'K': 4112, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}

## 2 workgroups / CU
- {'M': 4864, 'N': 8192, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
- {'M': 4864, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 0, 'waves_per_eu': 0, 'matrix_instr_nonkdim': 16, 'kpack': 2}
24 changes: 17 additions & 7 deletions python/perf-kernels/tools/tune_gemm/matmul_kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,26 @@ def matmul_kernel(a_ptr, b_ptr, c_ptr, bias_ptr, M, N, K, stride_am, stride_ak,
bias = tl.load(bias_ptrs, mask=offs_am < M, other=0.0)
acc_dtype = tl.float32 if a_ptr.type.element_ty != tl.int8 else tl.int32
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype)
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
if EVEN_K:
a = tl.load(a_ptrs)
b = tl.load(b_ptrs)
else:
a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)

max_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K) - 1
if EVEN_K:
max_k += 1
for k in range(0, max_k):
a = tl.load(tl.multiple_of(a_ptrs, (1, 16)))
b = tl.load(tl.multiple_of(b_ptrs, (16, 1)))
accumulator += tl.dot(a, b)
a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak
b_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_bk

if not EVEN_K:
k = max_k
offs_k = k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
a_ptrsX = a_ptr + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
b_ptrsX = b_ptr + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
a = tl.load(a_ptrsX, mask=offs_k[None, :] < K, other=0.0)
b = tl.load(b_ptrsX, mask=offs_k[:, None] < K, other=0.0)
accumulator += tl.dot(a, b)

c = accumulator.to(c_ptr.type.element_ty)
if BIAS:
c += bias[:, None]
Expand Down
268 changes: 268 additions & 0 deletions python/perf-kernels/tools/tune_gemm/process_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
import numpy as np
from statistics import mean
import argparse
import sys
import json
import os


def parse_args():
parser = argparse.ArgumentParser(
prog="tune a specific gemm size",
allow_abbrev=False,
)

parser.add_argument("-d", type=str, default="", help='*_ui dir')
parser.add_argument("-se", type=int, default=0, help="")
parser.add_argument("-sm", type=int, default=0, help="")
parser.add_argument("-sl", type=int, default=0, help="")
parser.add_argument("-wv", type=int, default=-1, help="")

args = parser.parse_args()
return args


def parse_trace(code_fullname, trace_fullname):
instr0_clk, bar1_clk, bar2_clk, bar3_clk, instr9_clk, mfma_dsRead_cnt, mfma_dsWrite_cnt, incomplete = gen_all_clk(
code_fullname, trace_fullname)
if incomplete:
return 0, 0, 0, 0, 0, 0, 0, 0, 0, incomplete
pro, loop, epi, iter_clk = gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk)
bar1_lat, bar2_lat = gen_fine_clk(bar1_clk, bar2_clk, bar3_clk)

lat1, lat2, lat_sum, idle1, idle2 = print_loop_eff(bar1_lat, bar2_lat, mfma_dsRead_cnt, mfma_dsWrite_cnt)
return pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2, incomplete


def print_list(myList):
for i in range(len(myList)):
print(myList[i])


def gen_all_clk(code_fullname, trace_fullname):
if not os.path.isfile(trace_fullname):
print(f"trace file not found {trace_fullname}")
return

marker_to_line = dict()
marker_to_line['firstInstr'] = 2
marker_barrier = list()

## Read code.json to get instruction idx
with open(code_fullname) as code_f:
code_data = json.load(code_f)
code_list = code_data['code']

found_1st_barrier = False
mfma_dsRead_cnt = 0
mfma_cnt_total = 0
should_cnt = False
## Find the s_barriers
for i in range(len(code_list)):
if "s_barrier" in code_list[i][0]:
marker_barrier.append(code_list[i])
if not found_1st_barrier:
## This is barrier1
found_1st_barrier = True
should_cnt = True
else:
## This is barrier2 or barrier3
should_cnt = False
if "mfma" in code_list[i][0] and should_cnt:
mfma_dsRead_cnt += 1
if "mfma" in code_list[i][0]:
mfma_cnt_total += 1

## /= 2 because the last iteration of local_load and tt.dot
## is peeled off by stream-pipeliner
mfma_cnt_total /= 2
mfma_dsWrite_cnt = mfma_cnt_total - mfma_dsRead_cnt

if len(marker_barrier) != 3:
print(f"Not 3 barriers?? Found {len(marker_barrier)}")
exit(0)
marker_to_line['barrier_before_ds_read'] = marker_barrier[0][2]
marker_to_line['instrAfterBarrier1'] = marker_barrier[0][2] + 1
marker_to_line['barrier_before_ds_write'] = marker_barrier[1][2]
marker_to_line['instrAfterBarrier2'] = marker_barrier[1][2] + 1
marker_to_line['barrier_after_loop'] = marker_barrier[2][2]
marker_to_line['instrAfterBarrier3'] = marker_barrier[2][2] + 1

instrAfterBarrier1_clk = list()
instrAfterBarrier2_clk = list()
instrAfterBarrier3_clk = 0
firstInstr_clk = 0
lastInstr_clk = 0

## Read trace to get clk info for the markers
with open(trace_fullname) as trace_f:
trace_data = json.load(trace_f)
trace_list = trace_data['wave']['instructions']

for i in range(len(trace_list)):
## Capture the clk for the first instruction in the kernel
if trace_list[i][-1] == marker_to_line['firstInstr']:
firstInstr_clk = trace_list[i][0]
## Capture barrier1
if trace_list[i][-1] == marker_to_line['instrAfterBarrier1']:
instrAfterBarrier1_clk.append(trace_list[i][0])
## Capture barrier2
if trace_list[i][-1] == marker_to_line['instrAfterBarrier2']:
instrAfterBarrier2_clk.append(trace_list[i][0])
## Capture barrier3
if trace_list[i][-1] == marker_to_line['instrAfterBarrier3']:
instrAfterBarrier3_clk = trace_list[i][0]
lastInstr_clk = trace_list[-1][0]

incomplete = False
if len(instrAfterBarrier1_clk) != len(instrAfterBarrier2_clk):
print("different length of instrAfterBarrier1_clk and instrAfterBarrier2_clk")
incomplete = True

len1 = len(instrAfterBarrier1_clk)
len2 = len(instrAfterBarrier2_clk)
len3 = instrAfterBarrier3_clk

if len1 == 0 or len2 == 0 or len3 == 0:
incomplete = True

#print(f"{firstInstr_clk}")
#print(f"{instrAfterBarrier1_clk}")
#print(f"{instrAfterBarrier2_clk}")
#print(f"{instrAfterBarrier3_clk}")
#print(f"{lastInstr_clk}")
#print(f"{mfma_dsRead_cnt}")
#print(f"{mfma_dsWrite_cnt}")
#print(f"{incomplete}")

return firstInstr_clk, instrAfterBarrier1_clk, instrAfterBarrier2_clk, instrAfterBarrier3_clk, lastInstr_clk, mfma_dsRead_cnt, int(mfma_dsWrite_cnt), incomplete


def gen_coarse_clk(instr0_clk, bar1_clk, bar3_clk, instr9_clk):
prologue = bar1_clk[0] - instr0_clk
loop = bar3_clk - bar1_clk[0]
epilogue = instr9_clk - bar3_clk
clk_per_iter = loop / len(bar1_clk)
return prologue, loop, epilogue, clk_per_iter


def gen_max_wid(code_fullname):
code_f = open(code_fullname)
code_data = json.load(code_f)
num_wv = code_data['code'][2][-2]
return int(num_wv / 8)


def gen_fine_clk(bar1_clk, bar2_clk, bar3_clk):
bar1_lat = list()
bar2_lat = list()
for i in range(len(bar1_clk)):
bar1_lat.append(bar2_clk[i] - bar1_clk[i])
if i + 1 == len(bar1_clk):
bar2_lat.append(bar3_clk - bar2_clk[i])
else:
bar2_lat.append(bar1_clk[i + 1] - bar2_clk[i])

return bar1_lat, bar2_lat


def list_to_stat(myList):
ave = mean(myList)
maxVal = max(myList)
minVal = min(myList)
stdVal = np.std(myList)
return int(ave), maxVal, minVal, stdVal


def print_loop_eff(list1, list2, cnt1, cnt2):
if len(list1) != len(list2):
print("lists do not have the same length!!")
exit(0)

ave1, max1, min1, stddev1 = list_to_stat(list1)
ave2, max2, min2, stddev2 = list_to_stat(list2)

return ave1, ave2, ave1 + ave2, ave1 - cnt1 * 2 * 16, ave2 - cnt2 * 2 * 16


def calc_global_store_cycles(code_fullname):
f = open(code_fullname)
data = json.load(f)
idx = 0
saw_store = False
total_mem = 0
total_hitcnt = 0
vmcnt_cnt = 0
total_iss = 0
total_iss_cnt = 0
for i in data["code"]:
if "global_store" in i[0]:
global_store_name = i[0].split()[0]
total_iss += i[-1] / i[-2]
total_iss_cnt += i[-2]
saw_store = True
idx += 1
if saw_store and "vmcnt(0)" in i[0]:
hitcnt = i[-2]
total_mem += i[-1] / i[-2]
total_hitcnt += hitcnt
vmcnt_cnt += 1

print(f"{idx} {global_store_name} {vmcnt_cnt} vmcnt(0)")
print(f"total cycles: {total_iss+total_mem:.0f} = {total_iss:.0f}(iss) + {total_mem:.0f}(mem)")
print(f"{total_iss/idx:.1f} issue cycles per {global_store_name}")
print(f"{total_mem/vmcnt_cnt:.1f} cycles per vmcnt(0)")


def main():
args = parse_args()
trace_dir = args.d
se = args.se
sm = args.sm
sl = args.sl
wv = args.wv

code_filename = "code.json"
code_fullname = os.path.join(trace_dir, code_filename)
trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wv}.json"
trace_fullname = os.path.join(trace_dir, trace_filename)
maxwid = gen_max_wid(code_fullname)

print("wid,prologue,loop,epilogue,iter_clk,lat1,lat2,iter_lat,idle1,idle2")
epi_total = 0
epi_1st = 0
total = 0
flag = False
cnt = 0
for wid in range(maxwid):
if wv != -1 and wid != wv:
continue
trace_filename = f"se{se}_sm{sm}_sl{sl}_wv{wid}.json"
trace_fullname = os.path.join(trace_dir, trace_filename)
if not os.path.isfile(trace_fullname):
#print(f"trace file not found {trace_fullname}")
return
pro, loop, epi, iter_clk, lat1, lat2, lat_sum, idle1, idle2, incomplete = parse_trace(code_fullname, trace_fullname)
if incomplete:
continue
print(f"{wid},{pro},{loop},{epi},{iter_clk:.0f},{lat1},{lat2},{lat_sum},{idle1},{idle2}")
if not flag:
epi_1st = epi
flag = False

if epi > 2 * epi_1st:
continue
epi_total += epi
total += epi + pro + loop
cnt += 1

if cnt == 0:
exit(0)
print(f"averaged epilogue cycles: {epi_total / cnt:.0f}")
print(f"averaged total cycles: {total / cnt:.0f}")
print(f"global_store info (averaged for all {maxwid} waves):")
calc_global_store_cycles(code_fullname)


if __name__ == '__main__':
sys.exit(main())
Loading
Loading