Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rocprofiler-sdk support for ROCMON #644

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
63ea8de
Add check for ROCM >= 6.2
TomTheBear Oct 8, 2024
90b84be
Split ROCM backends in 'v1' and 'sdk'
TomTheBear Oct 8, 2024
5ec039f
Filter files based on ROCM version check
TomTheBear Oct 8, 2024
90b6b2a
Rename defines in rocmon_v1_types
TomTheBear Oct 8, 2024
032ad5d
Rename groups for v1 and add groups for sdk
TomTheBear Oct 8, 2024
ae0c4e4
Add skeleton for rocmon sdk
TomTheBear Oct 8, 2024
abc8001
Update Rocprofiler SDK support. Not working yet
TomTheBear Oct 22, 2024
47b230e
Add check for ROCM >= 6.2
TomTheBear Oct 8, 2024
4c877e2
Split ROCM backends in 'v1' and 'sdk'
TomTheBear Oct 8, 2024
42ef30c
Filter files based on ROCM version check
TomTheBear Oct 8, 2024
bc1b8d0
Rename defines in rocmon_v1_types
TomTheBear Oct 8, 2024
ffa9338
Rename groups for v1 and add groups for sdk
TomTheBear Oct 8, 2024
82c33e9
Add skeleton for rocmon sdk
TomTheBear Oct 8, 2024
eac1c64
Update Rocprofiler SDK support. Not working yet
TomTheBear Oct 22, 2024
09a94f2
Delete group directory for amd_gpu_sdk, no differentiation required
TomTheBear Oct 31, 2024
100fb54
Check error code when initializing ROCm topology
TomTheBear Oct 31, 2024
a4eb7ee
Fix ERROR_PRINTS
TomTheBear Oct 31, 2024
3a782a4
Fix ERROR_PRINTS
TomTheBear Oct 31, 2024
0fff85f
Fix ERROR_PRINTS
TomTheBear Oct 31, 2024
df147a1
Fix ERROR_PRINTS
TomTheBear Oct 31, 2024
3a54fe6
Guard debug levels with ifdefs
TomTheBear Oct 31, 2024
0af128e
Rename groups again to amd_gpu
TomTheBear Oct 31, 2024
edb835d
Update Rocmon code
TomTheBear Oct 31, 2024
647b607
Always compile rocprofiler v1 support
TomTheBear Oct 31, 2024
b60e2f6
Fix uninitialized variable warnings
TomTheBear Oct 31, 2024
5a1de25
Update build config file
TomTheBear Oct 31, 2024
741c2ad
Merge branch 'rocm_sdk_update' of github.com:RRZE-HPC/likwid into roc…
TomTheBear Oct 31, 2024
d3e1eb8
Update code to work again but only v1 and smi, sdk still fails to init
TomTheBear Nov 10, 2024
1f4f9e1
Use typedef for bool only if not C99+
TomTheBear Nov 10, 2024
6277c48
Remove typedef for bool
TomTheBear Nov 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ ifneq ($(ROCM_INTERFACE), true)
OBJ := $(filter-out $(BUILD_DIR)/rocmon.o,$(OBJ))
OBJ := $(filter-out $(BUILD_DIR)/rocmon_marker.o,$(OBJ))
OBJ := $(filter-out $(BUILD_DIR)/topology_rocm.o,$(OBJ))
else
ifeq ($(strip $(ROCM_SDK_CHECK)),0)
OBJ := $(filter-out $(BUILD_DIR)/rocmon_sdk.o,$(OBJ))
endif
endif
ifeq ($(COMPILER),GCCPOWER)
OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ))
Expand Down Expand Up @@ -353,10 +357,16 @@ $(BUILD_DIR)/%.o: %.c
$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
$(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d

$(BUILD_DIR)/rocmon_marker.o: rocmon_marker.c
@echo "===> COMPILE $@"
$(BUILD_DIR)/rocmon_%.o: rocmon_%.c
@echo "===> COMPILE $@ with redefined symbol HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE"
$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
$(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE2 $@
$(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE_$@ $@

$(BUILD_DIR)/rocmon.o: rocmon.c
@echo "===> COMPILE $@ with redefined symbol HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE"
$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
$(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE_$@ $@


$(BUILD_DIR)/%.o: %.cc
@echo "===> COMPILE $@"
Expand Down
1 change: 0 additions & 1 deletion config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,5 @@ BUILDAPPDAEMON=false
# to be in the LD_LIBRARY_PATH to dynamically load the libraries.
# Include directory for ROCm headers
HSAINCLUDE = $(ROCM_HOME)/include
ROCPROFILERINCLUDE = $(ROCM_HOME)/include/rocprofiler
HIPINCLUDE = $(ROCM_HOME)/include
RSMIINCLUDE = $(ROCM_HOME)/include
18 changes: 9 additions & 9 deletions ext/GOTCHA/src/libc_wrappers.c
Original file line number Diff line number Diff line change
Expand Up @@ -426,8 +426,8 @@ int gotcha_int_printf(int fd, const char *format, ...) {
}

if (*str == 'd' || *str == 'i') {
signed long val;
char numstr[64];
signed long val = 0;
char numstr[64] = {'\0'};
if (char_width)
val = (signed long)(signed char)va_arg(args, signed int);
else if (short_width)
Expand All @@ -444,8 +444,8 @@ int gotcha_int_printf(int fd, const char *format, ...) {
add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer),
&num_printed, 1);
} else if (*str == 'u') {
unsigned long val;
char numstr[64];
unsigned long val = 0;
char numstr[64] = {'\0'};
if (char_width)
val = (unsigned long)(unsigned char)va_arg(args, unsigned int);
else if (short_width)
Expand All @@ -462,8 +462,8 @@ int gotcha_int_printf(int fd, const char *format, ...) {
add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer),
&num_printed, 1);
} else if (*str == 'x' || *str == 'X' || *str == 'p') {
unsigned long val;
char numstr[64];
unsigned long val = 0;
char numstr[64] = {'\0'};
if (*str != 'p') {
if (char_width)
val = (unsigned long)(unsigned char)va_arg(args, unsigned int);
Expand All @@ -486,7 +486,7 @@ int gotcha_int_printf(int fd, const char *format, ...) {
add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer),
&num_printed, 1);
} else if (*str == 'c') {
char cbuf[2];
char cbuf[2] = {'\0'};
cbuf[0] = (unsigned char)va_arg(args, unsigned int);
cbuf[1] = '\0';
add_to_buffer(cbuf, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed,
Expand All @@ -499,7 +499,7 @@ int gotcha_int_printf(int fd, const char *format, ...) {
add_to_buffer("%", fd, &buffer_pos, buffer, sizeof(buffer), &num_printed,
1);
} else {
char s[3];
char s[3] = {'\0'};
s[0] = '%';
s[1] = *str;
s[2] = '\0';
Expand All @@ -517,7 +517,7 @@ int gotcha_int_printf(int fd, const char *format, ...) {
}

void *gotcha_memset(void *s, int c, size_t n) {
size_t i;
size_t i = 0;
unsigned char byte = (unsigned char)c;
for (i = 0; i < n; i++) {
((unsigned char *)s)[i] = byte;
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
15 changes: 15 additions & 0 deletions groups/amd_gpu_v1/GDS.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
SHORT GDS Instructions

EVENTSET
ROCM0 ROCP_SQ_INSTS_GDS
ROCM1 ROCP_SQ_WAVES

METRICS
GPU GDS rw insts per work-item ROCM0/ROCM1

LONG
Formulas:
GPU GDS rw insts per work-item = ROCP_SQ_INSTS_GDS/ROCP_SQ_WAVES
--
The average number of GDS read or GDS write instructions executed
per work item (affected by flow control).
18 changes: 18 additions & 0 deletions groups/amd_gpu_v1/MEM.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
SHORT Memory utilization

EVENTSET
ROCM0 ROCP_TA_TA_BUSY
ROCM1 ROCP_GRBM_GUI_ACTIVE
ROCM2 ROCP_SE_NUM

METRICS
GPU memory utilization 100*max(ROCM0,16)/ROCM1/ROCM2

LONG
Formulas:
GPU memory utilization = 100*max(ROCP_TA_TA_BUSY,16)/ROCP_GRBM_GUI_ACTIVE/ROCP_SE_NUM
--
The percentage of GPUTime the memory unit is active. The result includes
the stall time (MemUnitStalled). This is measured with all extra fetches
and writes and any cache or memory effects taken into account.
Value range: 0% to 100% (fetch-bound).
23 changes: 23 additions & 0 deletions groups/amd_gpu_v1/PCI.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
SHORT PCI Transfers

EVENTSET
ROCM0 RSMI_PCI_THROUGHPUT_SENT
ROCM1 RSMI_PCI_THROUGHPUT_RECEIVED


METRICS
Runtime time
PCI sent ROCM0
PCI received ROCM1
PCI send bandwidth 1E-6*ROCM0/time
PCI recv bandwidth 1E-6*ROCM1/time

LONG
Formulas:
PCI sent = RSMI_PCI_THROUGHPUT_SENT
PCI received = RSMI_PCI_THROUGHPUT_RECEIVED
PCI send bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_SENT/runtime
PCI recv bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_RECEIVED/runtime
--
Currently not usable since the RSMI_PCI_THROUGHPUT_* events require
one second per call, so 2 seconds for both of them.
21 changes: 21 additions & 0 deletions groups/amd_gpu_v1/POWER.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
SHORT Power, temperature and voltage

EVENTSET
ROCM0 RSMI_POWER_AVE[0]
ROCM1 RSMI_TEMP_EDGE
ROCM2 RSMI_VOLT_VDDGFX


METRICS
Power average 1E-6*ROCM0
Edge temperature 1E-3*ROCM1
Voltage 1E-3*ROCM2

LONG
Formulas:
Power average = 1E-6*RSMI_POWER_AVE[0]
Edge temperature = 1E-3*RSMI_TEMP_EDGE
Voltage = 1E-3*RSMI_VOLT_VDDGFX
--
Gets the current average power consumption in watts, the
temperature in celsius and the voltage in volts.
15 changes: 15 additions & 0 deletions groups/amd_gpu_v1/SALU.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
SHORT SALU Instructions

EVENTSET
ROCM0 ROCP_SQ_INSTS_SALU
ROCM1 ROCP_SQ_WAVES

METRICS
GPU SALU insts per work-item ROCM0/ROCM1

LONG
Formulas:
GPU SALU insts per work-item = ROCP_SQ_INSTS_SALU/ROCP_SQ_WAVES
--
The average number of scalar ALU instructions executed per work-item
(affected by flow control).
15 changes: 15 additions & 0 deletions groups/amd_gpu_v1/SFETCH.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
SHORT SFetch Instructions

EVENTSET
ROCM0 ROCP_SQ_INSTS_SMEM
ROCM1 ROCP_SQ_WAVES

METRICS
GPU SFETCH insts per work-item ROCM0/ROCM1

LONG
Formulas:
GPU SFETCH insts per work-item = ROCP_SQ_INSTS_SMEM/ROCP_SQ_WAVES
--
The average number of scalar fetch instructions from the video memory
executed per work-item (affected by flow control).
19 changes: 19 additions & 0 deletions groups/amd_gpu_v1/STALLED.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
SHORT ALU stalled by LDS

EVENTSET
ROCM0 ROCP_SQ_WAIT_INST_LDS
ROCM1 ROCP_SQ_WAVES
ROCM2 ROCP_GRBM_GUI_ACTIVE

METRICS
GPU ALU stalled 100*ROCM0*4/ROCM1/ROCM2

LONG
Formulas:
GPU ALU stalled = 100*ROCP_SQ_WAIT_INST_LDS*4/ROCP_SQ_WAVES/ROCP_GRBM_GUI_ACTIVE
--
The percentage of GPUTime ALU units are stalled by the LDS input queue
being full or the output queue being not ready. If there are LDS bank
conflicts, reduce them. Otherwise, try reducing the number of LDS
accesses if possible.
Value range: 0% (optimal) to 100% (bad).
18 changes: 18 additions & 0 deletions groups/amd_gpu_v1/UTIL.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
SHORT GPU utilization

EVENTSET
ROCM0 ROCP_GRBM_COUNT
ROCM1 ROCP_GRBM_GUI_ACTIVE


METRICS
GPU utilization 100*ROCM1/ROCM0


LONG
Formulas:
GPU utilization = 100*ROCP_GRBM_GUI_ACTIVE/ROCP_GRBM_COUNT
--
This group reproduces the 'GPUBusy' metric provided by RocProfiler.
Note that the GPUBusy metric can also be selected directly; the
calculations are then done internally, which helps in case the metric
formula changes.
15 changes: 15 additions & 0 deletions groups/amd_gpu_v1/VALU.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
SHORT VALU Instructions

EVENTSET
ROCM0 ROCP_SQ_INSTS_VALU
ROCM1 ROCP_SQ_WAVES

METRICS
GPU VALU insts per work-item ROCM0/ROCM1

LONG
Formulas:
GPU VALU insts per work-item = ROCP_SQ_INSTS_VALU/ROCP_SQ_WAVES
--
The average number of vector ALU instructions executed per work-item
(affected by flow control).
15 changes: 15 additions & 0 deletions groups/amd_gpu_v1/WAVE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
SHORT Wavefronts

EVENTSET
ROCM0 ROCP_SQ_WAVES


METRICS
GPU wavefronts ROCM0


LONG
Formulas:
GPU wavefronts = ROCP_SQ_WAVES
--
Total Wavefronts
6 changes: 5 additions & 1 deletion make/config_checks.mk
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,11 @@ INCLUDES += -I$(CUDAINCLUDE) -I$(CUPTIINCLUDE)
endif

ifeq ($(strip $(ROCM_INTERFACE)), true)
ROCM_SDK_CHECK := $(shell which rocprofv3 2>/dev/null | wc -l)
# HSA includes 'hsa/xxx.h' and rocprofiler 'xxx.h'
DEFINES += -D__HIP_PLATFORM_AMD__
INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE)
INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(RSMIINCLUDE)
ifeq ($(strip $(ROCM_SDK_CHECK)),1)
DEFINES += -DLIKWID_ROCPROF_SDK
endif
endif
4 changes: 2 additions & 2 deletions src/access_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -255,15 +255,15 @@ access_client_startDaemon_bridge(int cpu_id, const char *bridge_path, struct soc
io_count = send(socket_fd, (char*) &io_buf, sizeof(io_buf), 0);

if (io_count != sizeof(io_buf)) {
ERROR_PRINT(Failed to send msg to the bridge socket)
ERROR_PRINT(Failed to send msg to the bridge socket);
close(socket_fd);
return -1;
}

io_count = recv(socket_fd, (char*) &io_buf, sizeof(io_buf), 0);

if (io_count != sizeof(io_buf)) {
ERROR_PRINT(Failed to recv msg from the bridge socket)
ERROR_PRINT(Failed to recv msg from the bridge socket);
close(socket_fd);
return -1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/access_x86_msr.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ access_x86_msr_init(const int cpu_id)
fd = open(msr_file_name, O_RDWR);
if (fd < 0)
{
ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno))
ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno));
ERROR_PLAIN_PRINT(Please check if 'msr' module is loaded and device files have correct permissions);
ERROR_PLAIN_PRINT(Alternatively you might want to look into (sys)daemonmode);
free(msr_file_name);
Expand Down
4 changes: 2 additions & 2 deletions src/applications/likwid.lua
Original file line number Diff line number Diff line change
Expand Up @@ -1576,7 +1576,7 @@ end
likwid.getMarkerResultsCuda = getMarkerResultsCuda

local function getMarkerResultsRocm(filename, gpulist, nan2value)
local gputopo = likwid.getGpuTopology_rocm()
local gputopo = likwid.getRocmTopology()
local ret = likwid.readMarkerFileRocm(filename)
if ret < 0 then
return nil, nil
Expand Down Expand Up @@ -1627,7 +1627,7 @@ likwid.getMarkerResultsRocm = getMarkerResultsRocm

local function printOutputRocm(results, metrics, gpulist, region, stats)
local maxLineFields = 0
local gputopo = likwid.getGpuTopology_rocm()
local gputopo = likwid.getRocmTopology()
local regionName = likwid.markerRegionTagRocm(region)
local regionGPUs = likwid.markerRegionGpusRocm(region)
local cur_gpulist = gpulist
Expand Down
7 changes: 6 additions & 1 deletion src/cpustring.c
Original file line number Diff line number Diff line change
Expand Up @@ -1036,10 +1036,15 @@ int
gpustr_to_gpulist_rocm(const char* gpustr, int* gpulist, int length)
{
int insert = 0;
topology_rocm_init();
int ret = topology_rocm_init();
if (ret < 0)
{
return ret;
}
RocmTopology_t gpu_topology = get_rocmTopology();
bstring bgpustr = bfromcstr(gpustr);
struct bstrList* commalist = bsplit(bgpustr, ',');
bdestroy(bgpustr);
for (int i = 0; i < commalist->qty; i++)
{
if (bstrchrp(commalist->entry[i], '-', 0) != BSTR_ERR)
Expand Down
Loading
Loading