diff --git a/Makefile b/Makefile index 3e2690421..06f92b1c9 100644 --- a/Makefile +++ b/Makefile @@ -151,6 +151,10 @@ ifneq ($(ROCM_INTERFACE), true) OBJ := $(filter-out $(BUILD_DIR)/rocmon.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/rocmon_marker.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/topology_rocm.o,$(OBJ)) +else +ifeq ($(strip $(ROCM_SDK_CHECK)),0) +OBJ := $(filter-out $(BUILD_DIR)/rocmon_sdk.o,$(OBJ)) +endif endif ifeq ($(COMPILER),GCCPOWER) OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ)) @@ -353,10 +357,16 @@ $(BUILD_DIR)/%.o: %.c $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ $(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d -$(BUILD_DIR)/rocmon_marker.o: rocmon_marker.c - @echo "===> COMPILE $@" +$(BUILD_DIR)/rocmon_%.o: rocmon_%.c + @echo "===> COMPILE $@ with redefined symbol HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE" $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ - $(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE2 $@ + $(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE_$(basename $(@F)) $@ + +$(BUILD_DIR)/rocmon.o: rocmon.c + @echo "===> COMPILE $@ with redefined symbol HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE" + $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ + $(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE_$(basename $(@F)) $@ + $(BUILD_DIR)/%.o: %.cc @echo "===> COMPILE $@" diff --git a/config.mk b/config.mk index 76b473c4a..69c1a88ae 100644 --- a/config.mk +++ b/config.mk @@ -197,6 +197,5 @@ BUILDAPPDAEMON=false # to be in the LD_LIBRARY_PATH to dynamically load the libraries. 
# Include directory for ROCm headers HSAINCLUDE = $(ROCM_HOME)/include -ROCPROFILERINCLUDE = $(ROCM_HOME)/include/rocprofiler HIPINCLUDE = $(ROCM_HOME)/include RSMIINCLUDE = $(ROCM_HOME)/include diff --git a/ext/GOTCHA/src/libc_wrappers.c b/ext/GOTCHA/src/libc_wrappers.c index 30608e1bb..cb5532d85 100644 --- a/ext/GOTCHA/src/libc_wrappers.c +++ b/ext/GOTCHA/src/libc_wrappers.c @@ -426,8 +426,8 @@ int gotcha_int_printf(int fd, const char *format, ...) { } if (*str == 'd' || *str == 'i') { - signed long val; - char numstr[64]; + signed long val = 0; + char numstr[64] = {'\0'}; if (char_width) val = (signed long)(signed char)va_arg(args, signed int); else if (short_width) @@ -444,8 +444,8 @@ int gotcha_int_printf(int fd, const char *format, ...) { add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1); } else if (*str == 'u') { - unsigned long val; - char numstr[64]; + unsigned long val = 0; + char numstr[64] = {'\0'}; if (char_width) val = (unsigned long)(unsigned char)va_arg(args, unsigned int); else if (short_width) @@ -462,8 +462,8 @@ int gotcha_int_printf(int fd, const char *format, ...) { add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1); } else if (*str == 'x' || *str == 'X' || *str == 'p') { - unsigned long val; - char numstr[64]; + unsigned long val = 0; + char numstr[64] = {'\0'}; if (*str != 'p') { if (char_width) val = (unsigned long)(unsigned char)va_arg(args, unsigned int); @@ -486,7 +486,7 @@ int gotcha_int_printf(int fd, const char *format, ...) { add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1); } else if (*str == 'c') { - char cbuf[2]; + char cbuf[2] = {'\0'}; cbuf[0] = (unsigned char)va_arg(args, unsigned int); cbuf[1] = '\0'; add_to_buffer(cbuf, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, @@ -499,7 +499,7 @@ int gotcha_int_printf(int fd, const char *format, ...) 
{ add_to_buffer("%", fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1); } else { - char s[3]; + char s[3] = {'\0'}; s[0] = '%'; s[1] = *str; s[2] = '\0'; @@ -517,7 +517,7 @@ int gotcha_int_printf(int fd, const char *format, ...) { } void *gotcha_memset(void *s, int c, size_t n) { - size_t i; + size_t i = 0; unsigned char byte = (unsigned char)c; for (i = 0; i < n; i++) { ((unsigned char *)s)[i] = byte; diff --git a/groups/amd_gpu/GDS.txt b/groups/amd_gpu_sdk/GDS.txt similarity index 100% rename from groups/amd_gpu/GDS.txt rename to groups/amd_gpu_sdk/GDS.txt diff --git a/groups/amd_gpu/MEM.txt b/groups/amd_gpu_sdk/MEM.txt similarity index 100% rename from groups/amd_gpu/MEM.txt rename to groups/amd_gpu_sdk/MEM.txt diff --git a/groups/amd_gpu/PCI.txt b/groups/amd_gpu_sdk/PCI.txt similarity index 100% rename from groups/amd_gpu/PCI.txt rename to groups/amd_gpu_sdk/PCI.txt diff --git a/groups/amd_gpu/POWER.txt b/groups/amd_gpu_sdk/POWER.txt similarity index 100% rename from groups/amd_gpu/POWER.txt rename to groups/amd_gpu_sdk/POWER.txt diff --git a/groups/amd_gpu/SALU.txt b/groups/amd_gpu_sdk/SALU.txt similarity index 100% rename from groups/amd_gpu/SALU.txt rename to groups/amd_gpu_sdk/SALU.txt diff --git a/groups/amd_gpu/SFETCH.txt b/groups/amd_gpu_sdk/SFETCH.txt similarity index 100% rename from groups/amd_gpu/SFETCH.txt rename to groups/amd_gpu_sdk/SFETCH.txt diff --git a/groups/amd_gpu/STALLED.txt b/groups/amd_gpu_sdk/STALLED.txt similarity index 100% rename from groups/amd_gpu/STALLED.txt rename to groups/amd_gpu_sdk/STALLED.txt diff --git a/groups/amd_gpu/UTIL.txt b/groups/amd_gpu_sdk/UTIL.txt similarity index 100% rename from groups/amd_gpu/UTIL.txt rename to groups/amd_gpu_sdk/UTIL.txt diff --git a/groups/amd_gpu/VALU.txt b/groups/amd_gpu_sdk/VALU.txt similarity index 100% rename from groups/amd_gpu/VALU.txt rename to groups/amd_gpu_sdk/VALU.txt diff --git a/groups/amd_gpu/WAVE.txt b/groups/amd_gpu_sdk/WAVE.txt similarity index 100% rename from 
groups/amd_gpu/WAVE.txt rename to groups/amd_gpu_sdk/WAVE.txt diff --git a/groups/amd_gpu_v1/GDS.txt b/groups/amd_gpu_v1/GDS.txt new file mode 100644 index 000000000..39c3446be --- /dev/null +++ b/groups/amd_gpu_v1/GDS.txt @@ -0,0 +1,15 @@ +SHORT GDS Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_GDS +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU GDS rw insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU GDS rw insts per work-item = ROCP_SQ_INSTS_GDS/ROCP_SQ_WAVES +-- +The average number of GDS read or GDS write instructions executed +per work item (affected by flow control). diff --git a/groups/amd_gpu_v1/MEM.txt b/groups/amd_gpu_v1/MEM.txt new file mode 100644 index 000000000..acc63a627 --- /dev/null +++ b/groups/amd_gpu_v1/MEM.txt @@ -0,0 +1,18 @@ +SHORT Memory utilization + +EVENTSET +ROCM0 ROCP_TA_TA_BUSY +ROCM1 ROCP_GRBM_GUI_ACTIVE +ROCM2 ROCP_SE_NUM + +METRICS +GPU memory utilization 100*max(ROCM0,16)/ROCM1/ROCM2 + +LONG +Formulas: +GPU memory utilization = 100*max(ROCP_TA_TA_BUSY,16)/ROCP_GRBM_GUI_ACTIVE/ROCP_SE_NUM +-- +The percentage of GPUTime the memory unit is active. The result includes +the stall time (MemUnitStalled). This is measured with all extra fetches +and writes and any cache or memory effects taken into account. +Value range: 0% to 100% (fetch-bound). 
diff --git a/groups/amd_gpu_v1/PCI.txt b/groups/amd_gpu_v1/PCI.txt new file mode 100644 index 000000000..cefaf307d --- /dev/null +++ b/groups/amd_gpu_v1/PCI.txt @@ -0,0 +1,23 @@ +SHORT PCI Transfers + +EVENTSET +ROCM0 RSMI_PCI_THROUGHPUT_SENT +ROCM1 RSMI_PCI_THROUGHPUT_RECEIVED + + +METRICS +Runtime time +PCI sent ROCM0 +PCI received ROCM1 +PCI send bandwidth 1E-6*ROCM0/time +PCI recv bandwidth 1E-6*ROCM1/time + +LONG +Formulas: +PCI sent = RSMI_PCI_THROUGHPUT_SENT +PCI received = RSMI_PCI_THROUGHPUT_RECEIVED +PCI send bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_SENT/runtime +PCI recv bandwidth = 1E-6*RSMI_PCI_THROUGHPUT_RECEIVED/runtime +-- +Currently not usable since the RSMI_PCI_THROUGHPUT_* events require +one second per call, so 2 seconds for both of them. diff --git a/groups/amd_gpu_v1/POWER.txt b/groups/amd_gpu_v1/POWER.txt new file mode 100644 index 000000000..49830efc0 --- /dev/null +++ b/groups/amd_gpu_v1/POWER.txt @@ -0,0 +1,21 @@ +SHORT Power, temperature and voltage + +EVENTSET +ROCM0 RSMI_POWER_AVE[0] +ROCM1 RSMI_TEMP_EDGE +ROCM2 RSMI_VOLT_VDDGFX + + +METRICS +Power average 1E-6*ROCM0 +Edge temperature 1E-3*ROCM1 +Voltage 1E-3*ROCM2 + +LONG +Formulas: +Power average = 1E-6*RSMI_POWER_AVE[0] +Edge temperature = 1E-3*RSMI_TEMP_EDGE +Voltage = 1E-3*RSMI_VOLT_VDDGFX +-- +Gets the current average power consumption in watts, the +temperature in degrees Celsius and the voltage in volts. diff --git a/groups/amd_gpu_v1/SALU.txt b/groups/amd_gpu_v1/SALU.txt new file mode 100644 index 000000000..a693421d1 --- /dev/null +++ b/groups/amd_gpu_v1/SALU.txt @@ -0,0 +1,15 @@ +SHORT SALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SALU insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU SALU insts per work-item = ROCP_SQ_INSTS_SALU/ROCP_SQ_WAVES +-- +The average number of scalar ALU instructions executed per work-item +(affected by flow control). 
diff --git a/groups/amd_gpu_v1/SFETCH.txt b/groups/amd_gpu_v1/SFETCH.txt new file mode 100644 index 000000000..bd0dfc3ff --- /dev/null +++ b/groups/amd_gpu_v1/SFETCH.txt @@ -0,0 +1,15 @@ +SHORT SFetch Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SMEM +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SFETCH insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU SFETCH insts per work-item = ROCP_SQ_INSTS_SMEM/ROCP_SQ_WAVES +-- +The average number of scalar fetch instructions from the video memory +executed per work-item (affected by flow control). diff --git a/groups/amd_gpu_v1/STALLED.txt b/groups/amd_gpu_v1/STALLED.txt new file mode 100644 index 000000000..9d6dc42c4 --- /dev/null +++ b/groups/amd_gpu_v1/STALLED.txt @@ -0,0 +1,19 @@ +SHORT ALU stalled by LDS + +EVENTSET +ROCM0 ROCP_SQ_WAIT_INST_LDS +ROCM1 ROCP_SQ_WAVES +ROCM2 ROCP_GRBM_GUI_ACTIVE + +METRICS +GPU ALD stalled 100*ROCM0*4/ROCM1/ROCM2 + +LONG +Formulas: +GPU ALD stalled = 100*ROCP_SQ_WAIT_INST_LDS*4/ROCP_SQ_WAVES/ROCP_GRBM_GUI_ACTIVE +-- +The percentage of GPUTime ALU units are stalled by the LDS input queue +being full or the output queue not being ready. If there are LDS bank +conflicts, reduce them. Otherwise, try reducing the number of LDS +accesses if possible. +Value range: 0% (optimal) to 100% (bad). diff --git a/groups/amd_gpu_v1/UTIL.txt b/groups/amd_gpu_v1/UTIL.txt new file mode 100644 index 000000000..7d9271e11 --- /dev/null +++ b/groups/amd_gpu_v1/UTIL.txt @@ -0,0 +1,18 @@ +SHORT GPU utilization + +EVENTSET +ROCM0 ROCP_GRBM_COUNT +ROCM1 ROCP_GRBM_GUI_ACTIVE + + +METRICS +GPU utilization 100*ROCM1/ROCM0 + + +LONG +Formulas: +GPU utilization = 100*ROCP_GRBM_GUI_ACTIVE/ROCP_GRBM_COUNT +-- +This group resembles the 'GPUBusy' metric provided by RocProfiler. +Note that the GPUBusy metric can also be selected directly; the +calculations are then done internally, in case the metric formula changes. 
diff --git a/groups/amd_gpu_v1/VALU.txt b/groups/amd_gpu_v1/VALU.txt new file mode 100644 index 000000000..5d57b9b20 --- /dev/null +++ b/groups/amd_gpu_v1/VALU.txt @@ -0,0 +1,15 @@ +SHORT VALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_VALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU VALU insts per work-item ROCM0/ROCM1 + +LONG +Formulas: +GPU VALU insts per work-item = ROCP_SQ_INSTS_VALU/ROCP_SQ_WAVES +-- +The average number of vector ALU instructions executed per work-item +(affected by flow control). diff --git a/groups/amd_gpu_v1/WAVE.txt b/groups/amd_gpu_v1/WAVE.txt new file mode 100644 index 000000000..fe8914ae1 --- /dev/null +++ b/groups/amd_gpu_v1/WAVE.txt @@ -0,0 +1,15 @@ +SHORT Wavefronts + +EVENTSET +ROCM0 ROCP_SQ_WAVES + + +METRICS +GPU wavefronts ROCM0 + + +LONG +Formulas: +GPU wavefronts = ROCP_SQ_WAVES +-- +Total Wavefronts diff --git a/make/config_checks.mk b/make/config_checks.mk index c84edd4a2..3d6825ef1 100644 --- a/make/config_checks.mk +++ b/make/config_checks.mk @@ -84,7 +84,11 @@ INCLUDES += -I$(CUDAINCLUDE) -I$(CUPTIINCLUDE) endif ifeq ($(strip $(ROCM_INTERFACE)), true) +ROCM_SDK_CHECK := $(shell which rocprofv3 2>/dev/null | wc -l) # HSA includes 'hsa/xxx.h' and rocprofiler 'xxx.h' DEFINES += -D__HIP_PLATFORM_AMD__ -INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE) +INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(RSMIINCLUDE) +ifeq ($(strip $(ROCM_SDK_CHECK)),1) +DEFINES += -DLIKWID_ROCPROF_SDK +endif endif diff --git a/src/access_client.c b/src/access_client.c index 9cb1b4715..5f558319d 100644 --- a/src/access_client.c +++ b/src/access_client.c @@ -255,7 +255,7 @@ access_client_startDaemon_bridge(int cpu_id, const char *bridge_path, struct soc io_count = send(socket_fd, (char*) &io_buf, sizeof(io_buf), 0); if (io_count != sizeof(io_buf)) { - ERROR_PRINT(Failed to send msg to the bridge socket) + ERROR_PRINT(Failed to send msg to the bridge socket); close(socket_fd); 
return -1; } @@ -263,7 +263,7 @@ access_client_startDaemon_bridge(int cpu_id, const char *bridge_path, struct soc io_count = recv(socket_fd, (char*) &io_buf, sizeof(io_buf), 0); if (io_count != sizeof(io_buf)) { - ERROR_PRINT(Failed to recv msg from the bridge socket) + ERROR_PRINT(Failed to recv msg from the bridge socket); close(socket_fd); return -1; } diff --git a/src/access_x86_msr.c b/src/access_x86_msr.c index d023b1082..aff42b940 100644 --- a/src/access_x86_msr.c +++ b/src/access_x86_msr.c @@ -122,7 +122,7 @@ access_x86_msr_init(const int cpu_id) fd = open(msr_file_name, O_RDWR); if (fd < 0) { - ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno)) + ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno)); ERROR_PLAIN_PRINT(Please check if 'msr' module is loaded and device files have correct permissions); ERROR_PLAIN_PRINT(Alternatively you might want to look into (sys)daemonmode); free(msr_file_name); diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua index 638d32d40..0654932e2 100644 --- a/src/applications/likwid.lua +++ b/src/applications/likwid.lua @@ -1576,7 +1576,7 @@ end likwid.getMarkerResultsCuda = getMarkerResultsCuda local function getMarkerResultsRocm(filename, gpulist, nan2value) - local gputopo = likwid.getGpuTopology_rocm() + local gputopo = likwid.getRocmTopology() local ret = likwid.readMarkerFileRocm(filename) if ret < 0 then return nil, nil @@ -1627,7 +1627,7 @@ likwid.getMarkerResultsRocm = getMarkerResultsRocm local function printOutputRocm(results, metrics, gpulist, region, stats) local maxLineFields = 0 - local gputopo = likwid.getGpuTopology_rocm() + local gputopo = likwid.getRocmTopology() local regionName = likwid.markerRegionTagRocm(region) local regionGPUs = likwid.markerRegionGpusRocm(region) local cur_gpulist = gpulist diff --git a/src/cpustring.c b/src/cpustring.c index 63ca736e8..19167aa7e 100644 --- a/src/cpustring.c +++ b/src/cpustring.c @@ -1036,10 
+1036,15 @@ int gpustr_to_gpulist_rocm(const char* gpustr, int* gpulist, int length) { int insert = 0; - topology_rocm_init(); + int ret = topology_rocm_init(); + if (ret < 0) + { + return ret; + } RocmTopology_t gpu_topology = get_rocmTopology(); bstring bgpustr = bfromcstr(gpustr); struct bstrList* commalist = bsplit(bgpustr, ','); + bdestroy(bgpustr); for (int i = 0; i < commalist->qty; i++) { if (bstrchrp(commalist->entry[i], '-', 0) != BSTR_ERR) diff --git a/src/frequency_cpu.c b/src/frequency_cpu.c index 30c28bd56..3dd0fe693 100644 --- a/src/frequency_cpu.c +++ b/src/frequency_cpu.c @@ -632,7 +632,7 @@ static int getAMDTurbo(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -641,7 +641,7 @@ static int getAMDTurbo(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -680,7 +680,7 @@ static int setAMDTurbo(const int cpu_id, const int turbo) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -689,7 +689,7 @@ static int setAMDTurbo(const int cpu_id, const int turbo) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -741,7 +741,7 @@ static int getIntelTurbo(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -750,7 +750,7 @@ static int getIntelTurbo(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -789,7 +789,7 @@ static int setIntelTurbo(const int cpu_id, const 
int turbo) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -798,7 +798,7 @@ static int setIntelTurbo(const int cpu_id, const int turbo) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -858,7 +858,7 @@ static int getIntelHWP(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -867,7 +867,7 @@ static int getIntelHWP(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -904,7 +904,7 @@ static int getBaseFreq(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } @@ -913,7 +913,7 @@ static int getBaseFreq(const int cpu_id) err = HPMaddThread(cpu_id); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return err; } } diff --git a/src/frequency_uncore.c b/src/frequency_uncore.c index d4667fa74..5e5de03f6 100644 --- a/src/frequency_uncore.c +++ b/src/frequency_uncore.c @@ -198,7 +198,7 @@ int freq_setUncoreFreqMin(const int socket_id, const uint64_t freq) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } @@ -267,7 +267,7 @@ uint64_t freq_getUncoreFreqMin(const int socket_id) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } @@ -329,7 +329,7 @@ int freq_setUncoreFreqMax(const int socket_id, const uint64_t freq) err = HPMaddThread(cpuId); if 
(err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } @@ -396,7 +396,7 @@ uint64_t freq_getUncoreFreqMax(const int socket_id) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } @@ -454,7 +454,7 @@ uint64_t freq_getUncoreFreqCur(const int socket_id) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PLAIN_PRINT(Cannot get access to MSRs) + ERROR_PLAIN_PRINT(Cannot get access to MSRs); return 0; } } diff --git a/src/includes/error.h b/src/includes/error.h index 739ab8cef..8982ff003 100644 --- a/src/includes/error.h +++ b/src/includes/error.h @@ -43,10 +43,10 @@ exit(EXIT_FAILURE) #define ERROR_PLAIN_PRINT(msg) \ - fprintf(stderr, "ERROR - [%s:%s:%d] " str(msg) "\n", __FILE__, __func__,__LINE__); + fprintf(stderr, "ERROR - [%s:%s:%d] " str(msg) "\n", __FILE__, __func__,__LINE__) #define ERROR_PRINT(fmt, ...) \ - fprintf(stderr, "ERROR - [%s:%s:%d] %s.\n" str(fmt) "\n", __FILE__, __func__,__LINE__, strerror(errno), ##__VA_ARGS__); + fprintf(stderr, "ERROR - [%s:%s:%d] %s.\n" str(fmt) "\n", __FILE__, __func__,__LINE__, strerror(errno), ##__VA_ARGS__) #define CHECK_ERROR(func, msg) \ if ((func) < 0) { \ @@ -65,6 +65,19 @@ exit(EXIT_FAILURE); \ } +#ifndef DEBUGLEV_ONLY_ERROR +#define DEBUGLEV_ONLY_ERROR 0 +#endif +#ifndef DEBUGLEV_INFO +#define DEBUGLEV_INFO 1 +#endif +#ifndef DEBUGLEV_DETAIL +#define DEBUGLEV_DETAIL 2 +#endif +#ifndef DEBUGLEV_DEVELOP +#define DEBUGLEV_DEVELOP 3 +#endif + #define VERBOSEPRINTREG(cpuid,reg,flags,msg) \ if (perfmon_verbosity >= DEBUGLEV_DETAIL) \ { \ diff --git a/src/includes/likwid.h b/src/includes/likwid.h index 146a226ae..3a6a1608a 100644 --- a/src/includes/likwid.h +++ b/src/includes/likwid.h @@ -38,10 +38,18 @@ #include +#ifndef DEBUGLEV_ONLY_ERROR #define DEBUGLEV_ONLY_ERROR 0 +#endif +#ifndef DEBUGLEV_INFO #define DEBUGLEV_INFO 1 +#endif +#ifndef DEBUGLEV_DETAIL #define 
DEBUGLEV_DETAIL 2 +#endif +#ifndef DEBUGLEV_DEVELOP #define DEBUGLEV_DEVELOP 3 +#endif #define LIKWID_VERSION "VERSION.RELEASE.MINORVERSION" #define LIKWID_COMMIT GITCOMMIT diff --git a/src/includes/rocmon.h b/src/includes/rocmon.h new file mode 100644 index 000000000..896138a99 --- /dev/null +++ b/src/includes/rocmon.h @@ -0,0 +1,8 @@ +#ifndef LIKWID_INTERNAL_ROCMON_H +#define LIKWID_INTERNAL_ROCMON_H + +#include + +GroupInfo* rocmon_get_group(int gid); + +#endif diff --git a/src/includes/rocmon_common_types.h b/src/includes/rocmon_common_types.h new file mode 100644 index 000000000..fe48bc866 --- /dev/null +++ b/src/includes/rocmon_common_types.h @@ -0,0 +1,228 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_common_types.h + * + * Description: Header File of rocmon for v1 and sdk backend. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_COMMON_TYPES_H +#define LIKWID_ROCMON_COMMON_TYPES_H + +#include + +#include +#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 +typedef struct metrics_table_header_t metrics_table_header_t; +#endif +#include +#include +#include +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include +#ifdef LIKWID_ROCPROF_SDK +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include +/*#ifdef ROCPROFILER_EXPORT*/ +/*#undef ROCPROFILER_EXPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_IMPORT*/ +/*#undef ROCPROFILER_IMPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MAJOR*/ +/*#undef ROCPROFILER_VERSION_MAJOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MINOR*/ +/*#undef ROCPROFILER_VERSION_MINOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include +#endif + + + +#ifndef ROCMWEAK +#define ROCMWEAK __attribute__(( weak )) +#endif +#ifndef FREE_IF_NOT_NULL +#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } +#endif +/*#ifndef ARRAY_COUNT*/ +/*#define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0]))*/ +/*#endif*/ +/*#ifndef SIZEOF_STRUCT_MEMBER*/ +/*#define SIZEOF_STRUCT_MEMBER(type, member) (sizeof(((type *) NULL)->member))*/ +/*#endif*/ + +typedef struct { + double 
lastValue; + double fullValue; +} RocmonEventResult; + +typedef struct { + RocmonEventResult* results; // First rocprofiler results, then SMI results + int numResults; +} RocmonEventResultList; + +#include +#include + +typedef struct { + bstring tag; + int groupID; + int gpuCount; + int eventCount; + double* time; + uint32_t* count; + int* gpulist; + double** counters; +} LikwidRocmResults; + +typedef struct { + int deviceId; // LIKWID device id + int rocprof_v1; + int activeGroup; + + // Rocprofiler V1 + hsa_agent_t hsa_agent; // HSA agent handle for this device + rocprofiler_t* v1_context; // Rocprofiler context (has activeEvents configured) +#ifdef LIKWID_ROCPROF_SDK + // Rocprofiler SDK + rocprofiler_agent_t agent; + rocprofiler_context_id_t sdk_context; // Rocprofiler context (has activeEvents configured) + rocprofiler_buffer_id_t buffer; + rocprofiler_callback_thread_t thread; +#endif + + // Available rocprofiler metrics + rocprofiler_info_data_t* v1_rocMetrics; +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_counter_info_v0_t* sdk_rocMetrics; +#endif + int numRocMetrics; + + // Available ROCm SMI events + Map_t smiMetrics; + + // Currently configured rocprofiler events (bound to context) + rocprofiler_feature_t* v1_activeRocEvents; +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_counter_info_v0_t* sdk_activeRocEvents; +#endif + int numActiveRocEvents; + + // Currently configured ROCm SMI events + RocmonSmiEvent* activeSmiEvents; + int numActiveSmiEvents; + + // Results for all events in all event sets + RocmonEventResultList* groupResults; + int numGroupResults; + +#ifdef LIKWID_ROCPROF_SDK + rocprofiler_profile_config_id_t* profiles; + int numProfiles; +#endif + + // Timestamps in ns + struct { + uint64_t start; + uint64_t read; + uint64_t stop; + } time; + + // buffer? 
+} RocmonDevice; + +typedef enum { + ROCMON_STATE_FINALIZED = 0, + ROCMON_STATE_INITIALIZED, + ROCMON_STATE_SETUP, + ROCMON_STATE_RUNNING, + ROCMON_STATE_STOPPED, + MAX_ROCMON_STATE, +} RocmonContextState; +#define MIN_ROCMON_STATE ROCMON_STATE_FINALIZED + +typedef struct { + int numGroups; // Number of allocated groups + int numActiveGroups; // Number of used groups + int activeGroup; // Currently active group + GroupInfo *groups; + + // Devices (HSA agents) + RocmonDevice *devices; + int numDevices; + + // System information + long double hsa_timestamp_factor; // hsa_timestamp * hsa_timestamp_factor = timestamp_in_ns + + // Rocprofiler SDK agents with buffers +#ifdef LIKWID_ROCPROF_SDK + int num_sdk_agents; + RocprofilerSdkAgentData* agents; +#endif + + // ROCm SMI events + Map_t smiEvents; + + // Use legacy rocprofiler v1 (unsigned: a signed 1-bit field can only hold 0 and -1) + unsigned int use_rocprofiler_v1:1; + RocmonContextState state; +} RocmonContext; + +//extern static RocmonContext* rocmon_context; + + +#endif /* LIKWID_ROCMON_COMMON_TYPES_H */ diff --git a/src/includes/rocmon_sdk.h b/src/includes/rocmon_sdk.h new file mode 100644 index 000000000..9aa6820f6 --- /dev/null +++ b/src/includes/rocmon_sdk.h @@ -0,0 +1,1337 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_sdk.h + * + * Description: Header File of rocmon module for ROCm >= 6.2. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SDK_H +#define LIKWID_ROCMON_SDK_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +static int rocmon_sdk_initialized = FALSE; + +static void *rocmon_sdk_dl_profiler_lib = NULL; +static void *rocmon_sdk_dl_hsa_lib = NULL; +//static void *rocmon_sdk_dl_rsmi_lib = NULL; + + +// setup function for rocprofiler sdk +//rocprofiler_tool_configure_result_t* rocprofiler_configure(uint32_t, const char*, uint32_t, rocprofiler_client_id_t*); + +#ifndef ROCM_CALL +#define ROCM_CALL( call, args, handleerror ) \ + do { \ + hsa_status_t _status = (*call##_ptr)args; \ + if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ + const char* err = NULL; \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ + rocprofiler_error_string(&err); \ + fprintf(stderr, "Error: %s\n", err); \ + handleerror; \ + } \ + } while (0) +#endif + +#ifndef ROCPROFILER_CALL +#define ROCPROFILER_CALL( call, args, handleerror ) \ + do { \ + rocprofiler_status_t _status = (*call##_ptr)args; \ + if(_status != ROCPROFILER_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ + handleerror; \ + } \ + } while (0); +#endif +// fprintf(stderr, "Error: %s\n", (*rocprofiler_get_status_string_ptr)(_status)); \ + +#ifndef DECLARE_ROCPROFILER_SDK +#define DECLARE_ROCPROFILER_SDK(funcname, funcsig) 
rocprofiler_status_t ROCMWEAK funcname funcsig; rocprofiler_status_t ( *funcname##_ptr ) funcsig; +#endif + + +DECLARE_ROCPROFILER_SDK(rocprofiler_create_context, (rocprofiler_context_id_t*)) +DECLARE_ROCPROFILER_SDK(rocprofiler_create_buffer, (rocprofiler_context_id_t, size_t, size_t, rocprofiler_buffer_policy_t, rocprofiler_buffer_tracing_cb_t, void*, rocprofiler_buffer_id_t*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_query_available_agents, (rocprofiler_agent_version_t, rocprofiler_query_available_agents_cb_t, size_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_get_timestamp, (rocprofiler_timestamp_t* ts)); +DECLARE_ROCPROFILER_SDK(rocprofiler_query_counter_info, (rocprofiler_counter_id_t, rocprofiler_counter_info_version_id_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_start_context, (rocprofiler_context_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_stop_context, (rocprofiler_context_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_create_profile_config, (rocprofiler_agent_id_t, rocprofiler_counter_id_t *, size_t, rocprofiler_profile_config_id_t *)); +DECLARE_ROCPROFILER_SDK(rocprofiler_destroy_profile_config, (rocprofiler_profile_config_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_configure_agent_profile_counting_service, (rocprofiler_context_id_t, rocprofiler_buffer_id_t, rocprofiler_agent_id_t, rocprofiler_agent_profile_callback_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_sample_agent_profile_counting_service, (rocprofiler_context_id_t, rocprofiler_user_data_t, rocprofiler_counter_flag_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_iterate_agent_supported_counters, (rocprofiler_agent_id_t, rocprofiler_available_counters_cb_t, void*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_flush_buffer, (rocprofiler_buffer_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_force_configure, (rocprofiler_configure_func_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_destroy_buffer, (rocprofiler_buffer_id_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_context_is_active, (rocprofiler_context_id_t, 
int*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_create_callback_thread, (rocprofiler_callback_thread_t*)); +DECLARE_ROCPROFILER_SDK(rocprofiler_assign_callback_thread, (rocprofiler_buffer_id_t, rocprofiler_callback_thread_t)); +DECLARE_ROCPROFILER_SDK(rocprofiler_query_record_counter_id, (rocprofiler_counter_instance_id_t id, rocprofiler_counter_id_t* counter_id)); +DECLARE_ROCPROFILER_SDK(rocprofiler_is_initialized, (int*)) + +const char *rocprofiler_get_status_string(rocprofiler_status_t); +const char * (*rocprofiler_get_status_string_ptr)(rocprofiler_status_t); + +#ifndef DECLAREFUNC_HSA +#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; +#endif +DECLAREFUNC_HSA(hsa_init, ()); +DECLAREFUNC_HSA(hsa_shut_down, ()); + + +static int +_rocmon_sdk_link_libraries() +{ + if (rocmon_sdk_dl_hsa_lib && rocmon_sdk_dl_profiler_lib) + { + return 0; + } + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries); + dlerror(); + // Need to link in the ROCm HSA libraries + rocmon_sdk_dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_hsa_lib) + { + ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_profiler_lib) + { + // Delete last error + dlerror(); + rocmon_sdk_dl_profiler_lib = dlopen("librocprofiler-sdk.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_sdk_dl_profiler_lib) + { + ERROR_PRINT(Rocprofiler library librocprofiler-sdk.so not found: %s, dlerror()); + return -1; + } + } + + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, 
rocprofiler_get_status_string); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_available_agents); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_timestamp); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_start_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_stop_context); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_profile_config); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_profile_config); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_configure_agent_profile_counting_service); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_iterate_agent_supported_counters); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_flush_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_counter_info); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_sample_agent_profile_counting_service); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_force_configure); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_destroy_buffer); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_context_is_active); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_create_callback_thread); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_assign_callback_thread); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_query_record_counter_id); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_is_initialized); + DLSYM_AND_CHECK(rocmon_sdk_dl_profiler_lib, rocprofiler_get_status_string); + + DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_init); + DLSYM_AND_CHECK(rocmon_sdk_dl_hsa_lib, hsa_shut_down); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm SDK libraries done); + return 0; +} + + + +typedef struct { + rocprofiler_agent_t *agents; + int num_agents; +} _rocmon_sdk_count_agents_cb_data; + +rocprofiler_status_t 
_rocmon_sdk_count_agents_cb(rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* udata) +{ + int gpu_agents = 0; + RocmonContext **stat_context = (RocmonContext **)udata; + RocmonContext* context = *stat_context; + RocmonDevice* devices = malloc(num_agents * sizeof(RocmonDevice)); + if (!devices) + { + return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; + } + memset(devices, 0, num_agents * sizeof(RocmonDevice)); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Found %d ROCm agents, num_agents); + for(size_t i = 0; i < num_agents; ++i) + { + const rocprofiler_agent_t* in_agent = agents_arr[i]; + if (in_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding AMD GPU at index %d, gpu_agents); + RocmonDevice* device = &devices[gpu_agents]; + device->agent = (rocprofiler_agent_t)*in_agent; + device->deviceId = in_agent->logical_node_type_id; + gpu_agents++; + } + } + context->devices = devices; + context->numDevices = gpu_agents; + return ROCPROFILER_STATUS_SUCCESS; +} + + +typedef struct { + rocprofiler_counter_info_v0_t *counters; + int num_counters; +} _rocmon_sdk_fill_agent_counters_cb_data; + +static void +_rocmon_sdk_free_agent_counters_internal(int num_counters, rocprofiler_counter_info_v0_t* counters) +{ + if ((num_counters < 0) || (!counters)) + { + return; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing %d counters, num_counters); + for (int i = 0; i < num_counters; i++) + { + rocprofiler_counter_info_v0_t* info = &counters[i]; + if (info) + { + if (info->name) free((char*)info->name); + if (info->description) free((char*)info->description); + if (info->block) free((char*)info->block); + if (info->expression) free((char*)info->expression); + } + } + free(counters); +} + + +rocprofiler_status_t +_rocmon_sdk_fill_agent_counters_cb(rocprofiler_agent_id_t agent, + rocprofiler_counter_id_t* counters, + size_t num_counters, + void* udata) +{ + _rocmon_sdk_fill_agent_counters_cb_data *data = 
(_rocmon_sdk_fill_agent_counters_cb_data*)udata; + + rocprofiler_counter_info_v0_t* out = malloc(num_counters * sizeof(rocprofiler_counter_info_v0_t)); + if (!out) + { + return -ENOMEM; + } + for (int i = 0; i < num_counters; i++) + { + rocprofiler_counter_info_v0_t info; + rocprofiler_status_t stat = (*rocprofiler_query_counter_info_ptr)(counters[i], (rocprofiler_counter_info_version_id_t)ROCPROFILER_COUNTER_INFO_VERSION_0, &info); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to query counter info for %d, i); + for (int j = 0; j < i; j++) + { + free((char*)out[j].name); + free((char*)out[j].description); + } + free(out); + return -EFAULT; + } + //ROCPROFILER_CALL(rocprofiler_query_counter_info, (counters[i], ROCPROFILER_COUNTER_INFO_VERSION_0, &info), + /*{ + free(out); + return -EFAULT; + });*/ + int namelen = strlen(info.name)+1; + int desclen = strlen(info.description)+1; + out[i].name = malloc(namelen * sizeof(char)); + if (!out[i].name) + { + _rocmon_sdk_free_agent_counters_internal(i, out); + return -ENOMEM; + } + out[i].description = malloc(desclen * sizeof(char)); + if (!out[i].description) + { + free((char*)out[i].name); + _rocmon_sdk_free_agent_counters_internal(i, out); + return -ENOMEM; + } + out[i].block = malloc((strlen(info.block)+1) * sizeof(char)); + if (!out[i].block) + { + free((char*)out[i].name); + free((char*)out[i].description); + _rocmon_sdk_free_agent_counters_internal(i, out); + return -ENOMEM; + } + out[i].expression = malloc((strlen(info.expression)+1) * sizeof(char)); + if (!out[i].expression) + { + free((char*)out[i].name); + free((char*)out[i].description); + free((char*)out[i].block); + _rocmon_sdk_free_agent_counters_internal(i, out); + return -ENOMEM; + } + int ret = 0; + ret = snprintf((char*)out[i].name, namelen-1, "%s", info.name); + ret = snprintf((char*)out[i].description, desclen-1, "%s", info.description); + out[i].id = info.id; + out[i].is_constant = info.is_constant; + out[i].is_derived = 
info.is_derived; + } + data->counters = out; + data->num_counters = num_counters; + return ROCPROFILER_STATUS_SUCCESS; +} + +int _rocmon_sdk_fill_agent_counters(RocmonDevice *device) +{ + _rocmon_sdk_fill_agent_counters_cb_data fill_data = { + .counters = NULL, + .num_counters = 0, + }; + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting counters for agent %d, device->deviceId); + rocprofiler_status_t _status = (rocprofiler_iterate_agent_supported_counters_ptr)(device->agent.id, _rocmon_sdk_fill_agent_counters_cb, &fill_data); + if (_status != ROCPROFILER_STATUS_SUCCESS) + { + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Agent %d provides %d counters, device->deviceId, fill_data.num_counters); + device->sdk_rocMetrics = fill_data.counters; + device->numRocMetrics = fill_data.num_counters; + + return ROCPROFILER_STATUS_SUCCESS; +} + + +static void +_rocmon_sdk_free_agent_counters(RocmonDevice *device) +{ + if (!device->sdk_rocMetrics) + { + return; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing counters for agent %d, device->deviceId); + _rocmon_sdk_free_agent_counters_internal(device->numRocMetrics, device->sdk_rocMetrics); + device->sdk_rocMetrics = NULL; + device->numRocMetrics = 0; +} + + +typedef struct { + RocmonContext** context; + rocprofiler_context_id_t devcontext; + int devid; +} rocmon_sdk_read_buffers_cb; + +static void +_rocmon_sdk_read_buffers(rocprofiler_context_id_t device_context, + rocprofiler_buffer_id_t buffer, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* udata, + uint64_t) +{ + rocmon_sdk_read_buffers_cb* cbdata = (rocmon_sdk_read_buffers_cb*)udata; + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + +/* if (cbdata->result->numResults == 0)*/ +/* {*/ +/* cbdata->result->results = malloc(sizeof(RocmonEventResult))*/ +/* }*/ + printf("_rocmon_sdk_read_buffers\n"); + for (int i = 0; i < num_headers; i++) + { + rocprofiler_record_header_t* h = headers[i]; + 
if(h->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && h->kind == ROCPROFILER_COUNTER_RECORD_VALUE) + { + rocprofiler_record_counter_t* r = h->payload; + rocprofiler_counter_id_t cid = {.handle = 0}; + (*rocprofiler_query_record_counter_id_ptr)(r->id, &cid); + for (int j = 0; j < context->numDevices; j++) + { + RocmonDevice *dev = &context->devices[j]; + if (dev->deviceId == cbdata->devid) + { + for (int k = 0; k < dev->numActiveRocEvents; k++) + { + rocprofiler_counter_info_v0_t* cinfo = &dev->sdk_activeRocEvents[k]; + if (cinfo->id.handle == cid.handle) + { + RocmonEventResultList* resultlist = &dev->groupResults[dev->activeGroup]; + resultlist->results[k].fullValue += r->counter_value; + resultlist->results[k].lastValue = resultlist->results[k].fullValue - resultlist->results[k].lastValue; + break; + } + } + break; + } + } + } + } + + +/* RocmonContext* mycontext = *cbdata->context;*/ +/* for (int i = 0; i < mycontext->numDevices; i++)*/ +/* {*/ +/* RocmonDevice* device = &mycontext->devices[i];*/ +/* if (device->agent.id.handle == cbdata->agent.id.handle)*/ +/* {*/ +/* RocmonEventResultList* groupResults = &device->groupResults[device->activeGroup];*/ + +/* for(int i = 0; i < num_headers; ++i)*/ +/* {*/ +/* rocprofiler_record_header_t* h = headers[i];*/ +/* if(h->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && h->kind == ROCPROFILER_COUNTER_RECORD_VALUE)*/ +/* {*/ +/* rocprofiler_record_counter_t* r = h->payload;*/ +/* if (r->id >= 0 && r->id < groupResults->numResults)*/ +/* {*/ +/* RocmonEventResult* eventResult = &cbdata->result->results[r->id];*/ +/* double diff = r->counter_value - eventResult->fullValue;*/ +/* eventResult->lastValue = eventResult->fullValue;*/ +/* eventResult->fullValue += diff;*/ +/* }*/ +/* }*/ +/* }*/ +/* }*/ +/* }*/ + + return; +} + + +static int _rocmon_sdk_create_devices(RocmonContext** stat_context) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + RocmonContext* context = *stat_context; + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents); + stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), stat_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to query available agents); + return -EFAULT; + } + if (context->numDevices == 0) + { + FREE_IF_NOT_NULL(context->devices); + return -1; + } + for (int i = 0; i < context->numDevices; i++) + { + rocprofiler_context_id_t device_context; + rocprofiler_buffer_id_t buffer; + rocprofiler_callback_thread_t thread; + RocmonDevice* device = &context->devices[i]; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating context for device %d, device->deviceId); + stat = (*rocprofiler_create_context_ptr)(&device_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating buffer for device %d, device->deviceId); + rocmon_sdk_read_buffers_cb devdata = { + .context = stat_context, + .devid = device->deviceId, + .devcontext = device_context + }; + stat = (*rocprofiler_create_buffer_ptr)(device_context, 100, 50, ROCPROFILER_BUFFER_POLICY_LOSSLESS, _rocmon_sdk_read_buffers, &devdata, &buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating callback thread for device %d, device->deviceId); + stat = (*rocprofiler_create_callback_thread_ptr)(&thread); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, 
(*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Assign callback thread to buffer for device %d, device->deviceId); + stat = (*rocprofiler_assign_callback_thread_ptr)(buffer, thread); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + errno = EFAULT; + ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(context->devices); + return -EFAULT; + } + + device->sdk_context = device_context; + device->buffer = buffer; + device->thread = thread; + } + return 0; +} + +int +tool_init(rocprofiler_client_finalize_t fini, void* udata) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_init); + + // initialize libraries + if (_rocmon_sdk_link_libraries() < 0) + { + ERROR_PLAIN_PRINT(Failed to initialize libraries); + return -EFAULT; + } + return _rocmon_sdk_create_devices(stat_context); + +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initialize HSA);*/ +/* hsa_status_t hstat = (*hsa_init_ptr)();*/ +/* if (hstat != HSA_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to initialize HSA);*/ +/* return -EFAULT;*/ +/* }*/ + + //ROCPROFILER_CALL(rocprofiler_query_available_agents, (ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), &agent_count), return -EFAULT;); +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Querying available agents);*/ +/* stat = (*rocprofiler_query_available_agents_ptr)(ROCPROFILER_AGENT_INFO_VERSION_0, _rocmon_sdk_count_agents_cb, sizeof(rocprofiler_agent_t), udata);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to query available agents);*/ +/* return -EFAULT;*/ +/* }*/ +/* if (context->numDevices == 0)*/ +/* {*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* 
return -1;*/ +/* }*/ + +/* for (int i = 0; i < context->numDevices; i++)*/ +/* {*/ +/* rocprofiler_context_id_t device_context;*/ +/* rocprofiler_buffer_id_t buffer;*/ +/* rocprofiler_callback_thread_t thread;*/ +/* RocmonDevice* device = &context->devices[i];*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating context for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_create_context_ptr)(&device_context);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating buffer for device %d, device->deviceId);*/ +/* rocmon_sdk_read_buffers_cb devdata = {*/ +/* .context = stat_context,*/ +/* .devid = device->deviceId,*/ +/* .devcontext = device_context*/ +/* };*/ +/* stat = (*rocprofiler_create_buffer_ptr)(device_context, 100, 50, ROCPROFILER_BUFFER_POLICY_LOSSLESS, _rocmon_sdk_read_buffers, &devdata, &buffer);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating callback thread for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_create_callback_thread_ptr)(&thread);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Assign callback thread to buffer for device %d, device->deviceId);*/ +/* stat = (*rocprofiler_assign_callback_thread_ptr)(buffer, thread);*/ +/* 
if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* errno = EFAULT;*/ +/* ERROR_PRINT(Failed to create callback thread for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* FREE_IF_NOT_NULL(context->devices);*/ +/* return -EFAULT;*/ +/* }*/ +/* */ +/* device->sdk_context = device_context;*/ +/* device->buffer = buffer;*/ +/* device->thread = thread;*/ +/* }*/ +/* return 0;*/ +} + + +void +tool_fini(void* udata) +{ + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Running tool_fini); + RocmonContext** stat_context = (RocmonContext**)udata; + RocmonContext* context = *stat_context; + if ((!context) || (!context->devices) || (context->numDevices == 0)) + { + return; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + int active = 0; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId); + stat = (*rocprofiler_context_is_active_ptr)(device->sdk_context, &active); + if (active) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping context for device %d, device->deviceId); + stat = (*rocprofiler_stop_context_ptr)(device->sdk_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to stop context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Flushing buffer for device %d, device->deviceId); + stat = (*rocprofiler_flush_buffer_ptr)(device->buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroying buffer for device %d, device->deviceId); + stat = (*rocprofiler_destroy_buffer_ptr)(device->buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to destroy buffer for device %d: %s, device->deviceId, 
(*rocprofiler_get_status_string_ptr)(stat)); + } + _rocmon_sdk_free_agent_counters(device); + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); + (*hsa_shut_down_ptr)(); +} + +void +_rocmon_sdk_set_profile(rocprofiler_context_id_t context_id, + rocprofiler_agent_id_t agent, + rocprofiler_agent_set_profile_callback_t set_config, + void* udata) +{ + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_sdk_set_profile); + RocmonDevice* device = (RocmonDevice*) udata; + if (device->agent.id.handle == agent.handle) + { + if (device->activeGroup >= 0 && device->activeGroup < device->numProfiles) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Setting profile %d for device %d, device->activeGroup, device->deviceId); + set_config(context_id, device->profiles[device->activeGroup]); + } + else + { + ERROR_PRINT(Invalid active group for device %d, device->deviceId); + } + } + else + { + ERROR_PRINT(Mismatch between device %s agent and given agent, device->deviceId); + } + return; +} + + + +rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* client_id) +{ + client_id->name = "LIKWID"; + if (!rocmon_context) + { + rocmon_context = malloc(sizeof(RocmonContext)); + if (!rocmon_context) + { + return NULL; + } + memset(rocmon_context, 0, sizeof(RocmonContext)); + } + static rocprofiler_tool_configure_result_t config_result = { + .size = sizeof(rocprofiler_tool_configure_result_t), + .initialize = tool_init, + .finalize = tool_fini, + .tool_data = &rocmon_context, + }; + DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing Rocprofiler SDK); + return &config_result; +} + +int +rocmon_sdk_init(RocmonContext* context, int numGpus, const int* gpuIds) +{ + int ret = 0; + rocprofiler_context_id_t text_context; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + if ((numGpus < 0) || (!gpuIds) || (!context)) + { + return -EINVAL; + } + if (rocmon_sdk_initialized) + { + return 0; + } + + // 
initialize libraries + ret = _rocmon_sdk_link_libraries(); + if (ret < 0) + { + ERROR_PLAIN_PRINT(Failed to initialize libraries); + return ret; + } + +/* stat = (*rocprofiler_force_configure_ptr)(rocprofiler_configure);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to configure rocprofiler: %s, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* return -EFAULT;*/ +/* }*/ +/* stat = (*rocprofiler_create_context_ptr)(&text_context);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to create test context: %s, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* return -EFAULT;*/ +/* }*/ + ret = _rocmon_sdk_create_devices(&rocmon_context); + if (ret < 0) + { + ERROR_PRINT(Failed to create SDK devices); + return ret; + } + + if (context->numDevices == 0) + { + errno = ENODEV; + ERROR_PRINT(Cannot find any ROCm GPUs); + return -ENODEV; + } + + RocmonDevice* devices = malloc(numGpus * sizeof(RocmonDevice)); + if (!devices) + { + return -ENOMEM; + } + memset(devices, 0, numGpus * sizeof(RocmonDevice)); + + for (int i = 0; i < numGpus; i++) + { + int idx = -1; + for (int j = 0; j < context->numDevices; j++) + { + RocmonDevice* device = &context->devices[j]; + if (gpuIds[i] == device->deviceId) + { + idx = j; + break; + } + } + if (idx >= 0) + { + memcpy(&devices[i], &context->devices[idx], sizeof(RocmonDevice)); + RocmonDevice* out = &devices[i]; +/* RocmonDevice* in = &context->devices[idx];*/ +/* out->agent = in->agent;*/ +/* printf("%d -> %d\n", in->agent.id.handle, out->agent.id.handle);*/ +/* out->thread = in->thread;*/ +/* out->buffer = in->buffer;*/ +/* printf("%d -> %d\n", in->buffer.handle, out->buffer.handle);*/ +/* out->sdk_context = in->sdk_context;*/ +/* printf("%d -> %d\n", in->sdk_context.handle, out->sdk_context.handle);*/ + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Fill agent counters for device %d, out->deviceId); + ret = _rocmon_sdk_fill_agent_counters(out); + if (ret < 0) + { + errno = -ret; + 
ERROR_PRINT(Failed to fill events for device %d: %s, out->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + } + else + { + errno = ENODEV; + ERROR_PRINT(Cannot find ROCm GPU %d, gpuIds[i]); + free(devices); + return -ENODEV; + } + } + free(context->devices); + context->devices = devices; + context->numDevices = numGpus; + + rocmon_sdk_initialized = TRUE; + return 0; +} + + +void +rocmon_sdk_finalize(RocmonContext* context) +{ + if (context) + { + if (context->devices) + { + for (int i = 0; i < context->numDevices; i++) + { + //free device i + RocmonDevice* dev = &context->devices[i]; + if (dev->sdk_activeRocEvents) + { + free(dev->sdk_activeRocEvents); + dev->sdk_activeRocEvents = NULL; + dev->numActiveRocEvents = 0; + } + if (dev->sdk_rocMetrics) + { + _rocmon_sdk_free_agent_counters_internal(dev->numRocMetrics, dev->sdk_rocMetrics); + dev->sdk_rocMetrics = NULL; + dev->numRocMetrics = 0; + } + if (dev->profiles) + { + for (int i = 0; i < dev->numProfiles; i++) + { + (*rocprofiler_destroy_profile_config_ptr)(dev->profiles[i]); + } + } + } + } +/* if (context->sdk_agents)*/ +/* {*/ +/* free(context->sdk_agents);*/ +/* context->sdk_agents = NULL;*/ +/* free(context->sdk_agent_buffers);*/ +/* context->sdk_agent_buffers = NULL;*/ +/* context->num_sdk_agents = 0;*/ +/* }*/ + } + rocmon_sdk_initialized = 0; + return; +} + + + +static int +_rocmon_setupCounters_rocprofiler_sdk(RocmonDevice* device, const char** events, int numEvents) +{ + rocprofiler_profile_config_id_t profile; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + if ((!device) || (!events) || (numEvents <= 0)) + { + return -EINVAL; + } + + int num_counters = 0; + rocprofiler_counter_id_t* counters = malloc(numEvents * sizeof(rocprofiler_counter_id_t)); + if (!counters) + { + return -ENOMEM; + } + + for (int i = 0; i < numEvents; i++) + { + int found = -1; + for (int j = 0; j < device->numRocMetrics; j++) + { + rocprofiler_counter_info_v0_t* m = &device->sdk_rocMetrics[j]; + if 
(strncmp(events[i], m->name, strlen(m->name)) == 0) + { + found = j; + break; + } + } + if (found >= 0) + { + counters[num_counters++] = device->sdk_rocMetrics[found].id; + } + else + { + ERROR_PRINT(Unknown ROCm event %s, events[i]); + } + } + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating profile for %d event(s) for device %d, num_counters, device->deviceId); + stat = (*rocprofiler_create_profile_config_ptr)(device->agent.id, counters, num_counters * sizeof(rocprofiler_counter_id_t), &profile); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to create profile: %s, (*rocprofiler_get_status_string_ptr)(stat)); + FREE_IF_NOT_NULL(counters); + return -ENOMEM; + } + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Increasing profile space to %d for device %d, device->numProfiles + 1, device->deviceId); + rocprofiler_profile_config_id_t* profiles = realloc(device->profiles, (device->numProfiles+1) * sizeof(rocprofiler_profile_config_id_t)); + if (!profiles) + { + (*rocprofiler_destroy_profile_config_ptr)(profile); + FREE_IF_NOT_NULL(counters); + return -ENOMEM; + } + device->profiles = profiles; + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding profile %d at idx %d for device %d, device->numProfiles, device->numProfiles, device->deviceId); + device->profiles[device->numProfiles++] = profile; + FREE_IF_NOT_NULL(counters); + return 0; +} + +int +rocmon_sdk_setupCounters(RocmonContext* context, int gid) +{ + int ret = 0; + int numRocEvents = 0; + const char **rocEvents = NULL; + // Check arguments + if (gid < 0 || gid >= context->numActiveGroups) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + ERROR_PRINT(Rocmon SDK not initialized); + return -EFAULT; + } + + // Get group info + GroupInfo* group = &context->groups[gid]; + + // Allocate memory for string arrays + rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (rocEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate rocEvent name 
array); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if (strncmp(name, "ROCP_", 5) == 0) + { + // Rocprofiler event + rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix + numRocEvents++; + } + } + if (numRocEvents == 0) + { + free(rocEvents); + return 0; + } + + // Add events to each device + //rocmon_context->activeGroup = gid; + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); + ret = _rocmon_setupCounters_rocprofiler_sdk(device, rocEvents, numRocEvents); + if (ret < 0) + { + if (rocEvents) free(rocEvents); + return ret; + } + + } + // Cleanup + free(rocEvents); + + return 0; +} + +static int _rocmon_sdk_get_timestamp(uint64_t* timestamp) +{ + rocprofiler_timestamp_t ts; + rocprofiler_status_t stat = (*rocprofiler_get_timestamp_ptr)(&ts); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to get timestamp: %s, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + + + *timestamp = (uint64_t) ts; + return 0; +} + +static int +_rocmon_startCounters_rocprofiler_sdk(RocmonDevice* device) +{ + int active = 0; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + //ROCPROFILER_CALL(rocprofiler_configure_agent_profile_counting_service, (device->sdk_context, device->buffer, device->agent.id, _rocmon_sdk_set_profile, NULL), \ + //ROCPROFILER_CALL(rocprofiler_destroy_profile_config, (profile), free(counters); return -EFAULT;); \ + free(counters); return -ENOMEM); + + // if not running + stat = (*rocprofiler_context_is_active)(device->sdk_context, &active); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + if (!active) + { + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Configuring counting service for device %d, device->deviceId); + stat = (*rocprofiler_configure_agent_profile_counting_service_ptr)(device->sdk_context, device->buffer, device->agent.id, _rocmon_sdk_set_profile, device); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to configure counting service for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting context for device %d, device->deviceId); + stat = (*rocprofiler_start_context_ptr)(device->sdk_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to start ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + } + return 0; +} + +int +rocmon_sdk_startCounters(RocmonContext* context) +{ + int ret = 0; + uint64_t timestamp = 0; + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + ERROR_PRINT(Rocmon SDK not initialized); + return -EFAULT; + } + + // Get timestamp + if (ret = _rocmon_sdk_get_timestamp(×tamp)) + { + return ret; + } + + // Start counters on each device + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + device->time.start = timestamp; + device->time.read = timestamp; + + // Start rocprofiler events + ret = _rocmon_startCounters_rocprofiler_sdk(device); + if (ret < 0) return ret; + + } + + return 0; +} + + +static int +_rocmon_stopCounters_rocprofiler_sdk(RocmonDevice* device) +{ + int active = 0; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId); + stat = (*rocprofiler_context_is_active)(device->sdk_context, &active); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + if (active) + { 
+ ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping context for device %d, device->deviceId); + stat = (*rocprofiler_stop_context_ptr)(device->sdk_context); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to stop ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } +/* stat = (*rocprofiler_flush_buffer_ptr)(device->buffer);*/ +/* if (stat != ROCPROFILER_STATUS_SUCCESS)*/ +/* {*/ +/* ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat));*/ +/* }*/ + } + return 0; +} + +int +rocmon_sdk_stopCounters(RocmonContext* context) +{ + int ret = 0; + uint64_t t = 0; + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + ERROR_PRINT(Rocmon SDK not initialized); + return -EFAULT; + } + // Read counters + ret = _rocmon_sdk_get_timestamp(&t); + if (ret < 0) + { + return ret; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + + // Stop rocprofiler events + ret = _rocmon_stopCounters_rocprofiler_sdk(device); + if (ret < 0) return ret; + device->time.stop = t; + } + + return 0; +} + +static int +_rocmon_readCounters_rocprofiler_sdk(RocmonDevice* device) +{ + int active = 0; + rocprofiler_status_t stat = ROCPROFILER_STATUS_SUCCESS; + // do read + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Checking context for device %d, device->deviceId); + stat = (*rocprofiler_context_is_active)(device->sdk_context, &active); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to check ROCm context for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + } + if (active) + { + rocprofiler_user_data_t udata = { + .value = 0, + .ptr = NULL, + }; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Sampling counting service for device %d, device->deviceId); + stat = (*rocprofiler_sample_agent_profile_counting_service_ptr)(device->sdk_context, udata, ROCPROFILER_COUNTER_FLAG_NONE); + if (stat 
!= ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to sample counting service for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + } + else + { + ERROR_PRINT(Device context for device %d not active, device->deviceId); + } + stat = (*rocprofiler_flush_buffer_ptr)(device->buffer); + if (stat != ROCPROFILER_STATUS_SUCCESS) + { + ERROR_PRINT(Failed to flush buffer for device %d: %s, device->deviceId, (*rocprofiler_get_status_string_ptr)(stat)); + return -EFAULT; + } + return 0; +} + + +int +rocmon_sdk_readCounters(RocmonContext* context) +{ + int ret = 0; + uint64_t t = 0; + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + return -EFAULT; + } + ret = _rocmon_sdk_get_timestamp(&t); + if (ret < 0) + { + return ret; + } + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + // Read counters + ret = _rocmon_readCounters_rocprofiler_sdk(device); + if (ret < 0) return ret; + device->time.read = t; + } + + return 0; +} + + + + +int +rocmon_sdk_getEventsOfGpu(RocmonContext* context, int gpuIdx, EventList_rocm_t* list) +{ + EventList_rocm_t tmpList = NULL; + Event_rocm_t* tmpEventList = NULL; + // Ensure rocmon is initialized + if (!rocmon_sdk_initialized) + { + return -EFAULT; + } + // Validate args + if ((gpuIdx < 0) || (gpuIdx > context->numDevices) || (!list)) + { + return -EINVAL; + } + + RocmonDevice* device = &context->devices[gpuIdx]; + + if (*list) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reusing existing event list); + tmpList = *list; + } + else + { + // Allocate list structure + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Allocate new event list); + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + tmpList->numEvents = 0; + tmpList->events = NULL; + } + + // Get number of events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Add 
%d RocProfiler SDK events, device->numRocMetrics); + if (device->numRocMetrics == 0) + { + // No events -> return list + *list = tmpList; + return 0; + } + // (Re-)Allocate event array + tmpEventList = realloc(tmpList->events, (tmpList->numEvents + device->numRocMetrics) * sizeof(Event_rocm_t)); + if (!tmpEventList) + { + if (!*list) free(tmpList); + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + return -ENOMEM; + } + tmpList->events = tmpEventList; + int startindex = tmpList->numEvents; + + // Copy rocprofiler event information + for (int i = 0; i < device->numRocMetrics; i++) + { + rocprofiler_counter_info_v0_t* event = &device->sdk_rocMetrics[i]; + Event_rocm_t* out = &tmpList->events[startindex + i]; + int len; + + // Copy name + len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "ROCP_%s", event->name); + } + + // Copy description + len = strlen(event->description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", event->description); + } + tmpList->numEvents++; + } + *list = tmpList; + return 0; +} + + + + +int +rocmon_sdk_switchActiveGroup(RocmonContext* context, int newGroupId) +{ + int ret; + + ret = rocmon_sdk_stopCounters(context); + if (ret < 0) + { + return ret; + } + + ret = rocmon_sdk_setupCounters(context, newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_sdk_startCounters(context); + if (ret < 0) + { + return ret; + } + + return 0; +} + + + + + +#endif /* LIKWID_ROCMON_SDK_H */ + diff --git a/src/includes/rocmon_sdk_types.h b/src/includes/rocmon_sdk_types.h new file mode 100644 index 000000000..7c8da13fb --- /dev/null +++ b/src/includes/rocmon_sdk_types.h @@ -0,0 +1,70 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_sdk_types.h + * + * Description: Header File of rocmon 
sdk module for ROCM >= 6.2 + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SDK_TYPES_H +#define LIKWID_ROCMON_SDK_TYPES_H + +#include +/*#ifdef ROCPROFILER_EXPORT*/ +/*#undef ROCPROFILER_EXPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_IMPORT*/ +/*#undef ROCPROFILER_IMPORT*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MAJOR*/ +/*#undef ROCPROFILER_VERSION_MAJOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_VERSION_MINOR*/ +/*#undef ROCPROFILER_VERSION_MINOR*/ +/*#endif*/ +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include +/*#ifdef ROCPROFILER_API*/ +/*#undef ROCPROFILER_API*/ +/*#endif*/ +#include + + +typedef struct { + rocprofiler_agent_t* agent; + rocprofiler_buffer_id_t buffer; + rocprofiler_context_id_t context; + RocmonEventResultList *result; +} RocprofilerSdkAgentData; + +typedef struct { + int num_agents; + RocprofilerSdkAgentData* agents; +} RocprofilerSdkData; + + + +#endif /* LIKWID_ROCMON_SDK_TYPES_H */ diff --git a/src/includes/rocmon_smi.h b/src/includes/rocmon_smi.h new file mode 100644 index 000000000..d40990a64 --- /dev/null +++ b/src/includes/rocmon_smi.h @@ -0,0 +1,1192 @@ 
+/* + * ======================================================================================= + * + * Filename: rocmon_smi.h + * + * Description: Header File of rocmon module for ROCm SMI. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SMI_H +#define LIKWID_ROCMON_SMI_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +static void *rocmon_dl_rsmi_lib = NULL; + +static int rocmon_smi_initialized = 0; + +#ifndef RSMI_CALL +#define RSMI_CALL( call, args, handleerror ) \ + do { \ + rsmi_status_t _status = (*call##_ptr)args; \ + if (_status != RSMI_STATUS_SUCCESS) { \ + fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ + handleerror; \ + } \ + } while (0) +#endif + +#ifndef DECLAREFUNC_SMI +#define DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; +#endif + +DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); +DECLAREFUNC_SMI(rsmi_shut_down, ()); 
+DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t* var_iter)); +DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); +DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); +DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* handle)); +DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); +DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, (uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); +DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); +DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); +DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); +DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); +DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); +DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); +DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* max_speed)); +DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); +DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); +DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); +DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, 
(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); +DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); + + +// ---------------------------------------------------- +// SMI event wrapper +// ---------------------------------------------------- + +static int +_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t value; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), deviceId, event->extra); + // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size + if (event->extra == 0) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); + else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); + else if (event->extra == 2) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); + else return -1; + + result->fullValue += value; + result->lastValue = value; + + return 0; +} + + +static int +_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t counter; + RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); + result->fullValue += counter; + result->lastValue = counter; + + return 0; +} + + +static int +_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t power; + RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); + result->fullValue += power; + result->lastValue = power; + + return 0; +} + + +static int +_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint64_t total; + RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); + result->fullValue += total; + result->lastValue = total; + + return 0; +} + + +static int +_smi_wrapper_memory_usage_get(int 
deviceId, RocmonSmiEvent* event, RocmonEventResult* result)
+{
+    uint64_t used;
+    RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1);
+    result->fullValue += used;
+    result->lastValue = used;
+
+    return 0;
+}
+
+
+// Read the memory busy percentage of the device; the reading is added to
+// fullValue and stored as lastValue. Returns -1 if the SMI call fails.
+static int
+_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result)
+{
+    uint32_t percent;
+    RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1);
+    result->fullValue += percent;
+    result->lastValue = percent;
+
+    return 0;
+}
+
+
+// Read the number of reserved memory pages (records pointer is NULL, only the
+// count is requested). Returns -1 if the SMI call fails.
+static int
+_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result)
+{
+    uint32_t num_pages;
+    RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1);
+    result->fullValue += num_pages;
+    result->lastValue = num_pages;
+
+    return 0;
+}
+
+
+// Read the fan RPMs of sensor event->subvariant. Returns -1 if the SMI call fails.
+static int
+_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result)
+{
+    int64_t speed;
+    RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1);
+    result->fullValue += speed;
+    result->lastValue = speed;
+
+    return 0;
+}
+
+
+// Read the fan speed of sensor event->subvariant. Returns -1 if the SMI call fails.
+static int
+_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result)
+{
+    int64_t speed;
+    RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1);
+    result->fullValue += speed;
+    result->lastValue = speed;
+
+    return 0;
+}
+
+
+// Read the maximum fan speed of sensor event->subvariant. Returns -1 if the
+// SMI call fails.
+static int
+_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result)
+{
+    // rsmi_dev_fan_speed_max_get is declared with a uint64_t* output
+    // parameter, so the local has to be unsigned (an int64_t* is an
+    // incompatible pointer type).
+    uint64_t max_speed;
+    RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1);
+    result->fullValue += max_speed;
+    result->lastValue = max_speed;
+
+    return 0;
+}
+
+
+static int
+_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result)
+{
+    int64_t temperature;
+    RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return
-1); + result->fullValue += temperature; + result->lastValue = temperature; + + return 0; +} + + +static int +_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + int64_t voltage; + RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); + result->fullValue += voltage; + result->lastValue = voltage; + + return 0; +} + + +static int +_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t overdrive; + RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); + result->fullValue += overdrive; + result->lastValue = overdrive; + + return 0; +} + + +static int +_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + rsmi_error_count_t error_count; + RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); + + if (event->extra == 0) + { + result->lastValue = error_count.correctable_err - result->fullValue; + result->fullValue = error_count.correctable_err; + } + else if (event->extra == 1) + { + result->lastValue = error_count.uncorrectable_err - result->fullValue; + result->fullValue = error_count.uncorrectable_err; + } + else + { + return -1; + } + + return 0; +} + + +static int +_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) +{ + uint32_t num_items; + RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); + result->fullValue += num_items; + result->lastValue = num_items; + + return 0; +} + + +static int +_rocmon_smi_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD SMI libraries); + + // Need to link in the Rocprofiler libraries + rocmon_dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | 
RTLD_GLOBAL); + if (!rocmon_dl_rsmi_lib) + { + ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); + return -1; + } + + // Link SMI functions + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_init); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_shut_down); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_func_iter_value_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_func_iter_next); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_power_ave_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_pci_throughput_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_total_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_usage_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_memory_reserved_pages_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_rpms_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_speed_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_fan_speed_max_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_temp_metric_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_volt_metric_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_overdrive_level_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_dev_ecc_count_get); + DLSYM_AND_CHECK(rocmon_dl_rsmi_lib, rsmi_compute_process_info_get); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); + return 0; +} + + + + +// ---------------------------------------------------- +// Rocmon SMI helper functions +// ---------------------------------------------------- + +static bstring +_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t 
subvariant)
+{
+    // Label layout: "func" for plain and multi-instance events,
+    // "func|variant" and "func|variant|subvariant" otherwise.
+    switch (type)
+    {
+        case ROCMON_SMI_EVENT_TYPE_NORMAL:
+            return bfromcstr(funcname);
+        case ROCMON_SMI_EVENT_TYPE_VARIANT:
+            return bformat("%s|%" PRIu64, funcname, variant);
+        case ROCMON_SMI_EVENT_TYPE_SUBVARIANT:
+            return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant);
+        case ROCMON_SMI_EVENT_TYPE_INSTANCES:
+            return bfromcstr(funcname);
+    }
+    // Unexpected type value: do not fall off the end of a non-void function,
+    // use the plain function name as label.
+    return bfromcstr(funcname);
+}
+
+
+/*
+ * Register the events belonging to an SMI function (and variant/subvariant)
+ * in the metric map of a device.
+ *
+ * The label built from type/funcname/variant/subvariant is looked up in the
+ * global rocmon_context->smiEvents map. Unknown labels are ignored (return 0).
+ * For multi-instance events with subvariant > 0 only the instance counter of
+ * the already registered device events is incremented. Otherwise a copy of
+ * each event in the list is added to device->smiMetrics.
+ *
+ * Returns 0 on success, -1 if a previous instance is missing and -ENOMEM if
+ * an event copy cannot be allocated.
+ */
+static int
+_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant)
+{
+    int ret;
+
+    // Get event by label
+    RocmonSmiEventList* list = NULL;
+    bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant);
+    ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list);
+    bdestroy(label);
+    if (ret < 0)
+    {
+        // Event not registered -> ignore
+        return 0;
+    }
+
+    // For events with multiple sensor, only make one entry -> find if one exists
+    if (type == ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0)
+    {
+        // Get list from map
+        for (int i = 0; i < list->numEntries; i++)
+        {
+            RocmonSmiEvent* event = &list->entries[i];
+            RocmonSmiEvent* existingEvent = NULL;
+            ret = get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent);
+            if (ret < 0)
+            {
+                ERROR_PRINT(Failed to find previous instance for event %s, event->name);
+                return -1;
+            }
+
+            // Update instance information
+            existingEvent->instances++;
+        }
+        return 0;
+    }
+
+    for (int i = 0; i < list->numEntries; i++)
+    {
+        RocmonSmiEvent* event = &list->entries[i];
+
+        // Allocate memory for device event description
+        RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent));
+        if (tmpEvent == NULL)
+        {
+            ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name);
+            return -ENOMEM;
+        }
+
+        // Copy information from global description
+        memcpy(tmpEvent, event, sizeof(RocmonSmiEvent));
+        tmpEvent->variant = variant;
+        tmpEvent->subvariant = subvariant;
+        tmpEvent->instances = 1;
+
+        // Save event info to device event map
+        add_smap(device->smiMetrics, tmpEvent->name, tmpEvent);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Iterate the subvariants of one variant of an SMI function and register the
+ * matching events for the device.
+ *
+ * Without subvariants (RSMI_STATUS_NO_DATA on open) the function is
+ * registered as a plain variant event. With subvariants, the default variant
+ * is treated as a multi-instance event, everything else as subvariant event.
+ *
+ * Returns 0 on success and a negative value if registering an event fails.
+ */
+static int
+_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter)
+{
+    rsmi_func_id_iter_handle_t sub_var_iter;
+    rsmi_func_id_value_t value;
+    rsmi_status_t status;
+    int ret;
+
+    // Get open subvariants iterator
+    status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter);
+    if (status == RSMI_STATUS_NO_DATA)
+    {
+        // No subvariants
+        ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0);
+        if (ret < 0) return -1;
+        return 0;
+    }
+
+    // Subvariants available -> iterate them
+    do {
+        // Get subvariant information
+        (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value);
+
+        // Process info
+        if (variant == RSMI_DEFAULT_VARIANT)
+            ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id);
+        else
+            ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id);
+        if (ret < 0)
+        {
+            // Do not leak the open subvariant iterator on the error path
+            (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter);
+            return ret;
+        }
+
+        // Advance iterator
+        status = (*rsmi_func_iter_next_ptr)(sub_var_iter);
+    } while (status != RSMI_STATUS_NO_DATA);
+
+    // Close iterator
+    (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter);
+
+    return 0;
+}
+
+
+static int
+_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle)
+{
+    rsmi_func_id_iter_handle_t var_iter;
+    rsmi_func_id_value_t value;
+    rsmi_status_t status;
+    int ret;
+
+    // Get open variants iterator
+    status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter);
+    if (status == RSMI_STATUS_NO_DATA)
+    {
+        // No variants
+        ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0);
+        if (ret < 0) return -1;
+        return 0;
+    }
+
+    // Variants available
-> iterate them
+    do {
+        // Get variant information
+        (*rsmi_func_iter_value_get_ptr)(var_iter, &value);
+
+        // Get function subvariants
+        ret = _rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter);
+        if (ret < 0)
+        {
+            // Do not leak the open variant iterator on the error path
+            (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter);
+            return -1;
+        }
+
+        // Advance iterator
+        status = (*rsmi_func_iter_next_ptr)(var_iter);
+    } while (status != RSMI_STATUS_NO_DATA);
+
+    // Close iterator
+    (*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter);
+
+    return 0;
+}
+
+
+/*
+ * Iterate all SMI functions supported by a device and register the matching
+ * events (including variants and subvariants) in the device metric map.
+ * Afterwards the device independent function rsmi_compute_process_info_get
+ * is registered.
+ *
+ * Returns 0 on success and -1 on any failure. The function iterator is
+ * closed on every path after it was opened.
+ */
+static int
+_rocmon_smi_get_functions(RocmonDevice* device)
+{
+    rsmi_func_id_iter_handle_t iter_handle;
+    rsmi_func_id_value_t value;
+    rsmi_status_t status;
+    int ret;
+
+    // Open iterator
+    RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), {
+        return -1;
+    });
+
+    do
+    {
+        // Get function information
+        RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), {
+            ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId);
+            RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), );
+            return -1;
+        });
+
+        // Get function variants
+        ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle);
+        if (ret < 0)
+        {
+            ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId);
+            RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), );
+            return -1;
+        }
+
+        // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment,
+        // so we check that the function pointer exists to avoid segfaults.)
+        // Without the pointer 'status' would never be assigned and the loop
+        // condition would read an indeterminate value, so bail out instead.
+        if (!rsmi_func_iter_next_ptr)
+        {
+            ERROR_PRINT(ROCm SMI function rsmi_func_iter_next not linked);
+            RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), );
+            return -1;
+        }
+        status = (*rsmi_func_iter_next_ptr)(iter_handle);
+    } while (status != RSMI_STATUS_NO_DATA);
+
+    // Close iterator
+    RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), );
+
+    // Add device independent functions
+    ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0);
+    if (ret < 0) return -1;
+
+    return 0;
+}
+
+
+
+/*
+ * Add one event description to the global SMI event map.
+ *
+ * Events are grouped in lists keyed by the label built from
+ * type/smifunc/variant/subvariant; a missing list is created on demand.
+ *
+ * name         event name, truncated to fit event->name if too long
+ * type         event type, selects the label layout
+ * smifunc      name of the SMI function providing the value
+ * variant      SMI variant id
+ * subvariant   SMI subvariant id
+ * extra        wrapper specific selector stored in the event
+ * measureFunc  wrapper used to read the event
+ *
+ * Returns 0 on success and -ENOMEM if the list or the event entry cannot be
+ * allocated. On failure an existing list is left unchanged.
+ */
+static int
+_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc)
+{
+    // Add new event list to map (if not already present)
+    bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant);
+    RocmonSmiEventList* list = NULL;
+    if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0)
+    {
+        // Allocate memory for event list
+        list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList));
+        if (list == NULL)
+        {
+            ERROR_PRINT(Failed to allocate memory for SMI event list %s, name);
+            // The label is owned by this function, release it on this path too
+            bdestroy(label);
+            return -ENOMEM;
+        }
+        list->entries = NULL;
+        list->numEntries = 0;
+
+        add_smap(rocmon_context->smiEvents, bdata(label), list);
+    }
+    bdestroy(label);
+
+    // Allocate memory for another event in list. Grow through a temporary
+    // pointer and bump the counter only afterwards, so a failed realloc
+    // neither loses the old array nor leaves numEntries out of sync.
+    RocmonSmiEvent* entries = (RocmonSmiEvent*) realloc(list->entries, (list->numEntries + 1) * sizeof(RocmonSmiEvent));
+    if (entries == NULL)
+    {
+        ERROR_PRINT(Failed to allocate memory for SMI event %s, name);
+        return -ENOMEM;
+    }
+    list->entries = entries;
+    list->numEntries++;
+
+    // Set event properties
+    RocmonSmiEvent* event = &list->entries[list->numEntries-1];
+    // Copy at most size-1 characters and terminate inside the array; index
+    // sizeof(event->name) is one past the end.
+    strncpy(event->name, name, sizeof(event->name) - 1);
+    event->name[sizeof(event->name) - 1] = '\0';
+    event->type = type;
+    event->variant = variant;
+    event->subvariant = subvariant;
+    event->extra = extra;
+    event->instances = 0; // gets set when scanning supported device functions
+    event->measureFunc = measureFunc;
+
+    return 0;
+}
+ +#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } +#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) +#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) +#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) +#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, extra, measurefunc) + + +static void +_rcomon_smi_free_event_list(void* vlist) +{ + RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; + if (list) + { + FREE_IF_NOT_NULL(list->entries); + free(list); + } +} + + +static int +_rocmon_smi_init_events(RocmonContext* context) +{ + int ret; + + // Init map + ret = init_map(&context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); + if (ret < 0) + { + ERROR_PRINT(Failed to create map for ROCm SMI events); + return ret; + } + + // Add events + ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); + ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); + ADD_SMI_EVENT_I("POWER_AVE", "rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, 
&_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); + ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); + ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); + ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); + ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); + ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); + ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); + ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); + ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, &_smi_wrapper_overdrive_level_get ); + ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); + 
ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); + 
ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); + ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); + + return 0; +} + +static int +_rocmon_setupCounters_smi(RocmonDevice* device, const 
char** events, int numEvents) +{ + int ret; + const int instanceNumLen = 5; + + // Delete previous events + if (device->activeSmiEvents) + { + free(device->activeSmiEvents); + device->activeSmiEvents = NULL; + device->numActiveSmiEvents = 0; + } + + // Look if the are any events + if (numEvents <= 0) + { + return 0; + } + + // Create event array + RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); + if (activeEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate active event list); + return -ENOMEM; + } + + for (int i = 0; i < numEvents; i++) + { + char eventName[MAX_ROCMON_SMI_EVENT_NAME]; + int instance = -1; + + // Parse event name -> normal event vs one with multiple instances (EVENT[0]) + const char* event = events[i]; + char* instancePart = strrchr(event, '['); + if (instancePart != NULL) + { + char withoutBrackets[instanceNumLen+1]; // +1 is '\0' + int partlen = strlen(instancePart); + + // Check if number fit in 'withoutBrackets' + if (partlen - 2 > instanceNumLen) + { + ERROR_PRINT(Instance number in '%s' is too large, event); + free(activeEvents); + return -EINVAL; + } + + // Copy instance number without brackets + strncpy(withoutBrackets, instancePart+1, partlen-2); + withoutBrackets[instanceNumLen] = '\0'; + + // Parse instance as number + char* endParsed; + instance = strtol(withoutBrackets, &endParsed, 10); + + // Check if parsing was successful + char* endOfString = &withoutBrackets[partlen-2]; + if (endParsed != endOfString) + { + ERROR_PRINT(Failed to parse instance number in '%s', event); + free(activeEvents); + return -EINVAL; + } + + // Copy event name without instance + int eventNameLen = instancePart - event; + strncpy(eventName, event, eventNameLen); + eventName[eventNameLen] = '\0'; + } + else + { + // Copy entire event name + strncpy(eventName, event, MAX_ROCMON_SMI_EVENT_NAME); + } + + // Lookup event in available events + RocmonSmiEvent* metric = NULL; + ret = get_smap_by_key(device->smiMetrics, 
eventName, (void**)&metric); + if (ret < 0) + { + ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); + free(activeEvents); + return -EINVAL; + } + + // Copy event + RocmonSmiEvent* tmpEvent = &activeEvents[i]; + memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); + + // Check if event supports instances + if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event requires instances + if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) + { + ERROR_PRINT(No instance number given but event '%s' requires one, eventName); + free(activeEvents); + return -EINVAL; + } + + // Check if event has enough instances + if (instance >= 0 && instance >= metric->instances) + { + ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); + free(activeEvents); + return -EINVAL; + } + + // Set instance number + if (instance >= 0) + { + tmpEvent->subvariant = instance; + } + } + + device->activeSmiEvents = activeEvents; + device->numActiveSmiEvents = numEvents; + + return 0; +} + + +int +rocmon_smi_setupCounters(RocmonContext* context, int gid) +{ + int ret = 0; + int numSmiEvents = 0; + const char **smiEvents = NULL; + // Check arguments + if (gid < 0 || gid >= context->numActiveGroups) + { + return -EINVAL; + } + + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + + // Get group info + GroupInfo* group = &context->groups[gid]; + + // Allocate memory for string arrays + smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); + if (smiEvents == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate smiEvents name array); + return -ENOMEM; + } + + // Go through each event and sort it + for (int i = 0; i < group->nevents; i++) + { + const char* name = group->events[i]; + if 
(strncmp(name, "RSMI_", 5) == 0) + { + // Rocprofiler event + smiEvents[numSmiEvents] = name + 5; // +5 removes 'ROCP_' prefix + numSmiEvents++; + } + } + if (numSmiEvents == 0) + { + free(smiEvents); + return 0; + } + + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); + if (ret < 0) + { + ERROR_PRINT(Failed to setup ROCMON SMI events for device %d, i); + } + } + free(smiEvents); + return 0; +} + +int +rocmon_smi_readCounters(RocmonContext* context) +{ + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + if (context->activeGroup < 0) + { + return -EFAULT; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values + RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; + for (int i = 0; i < device->numActiveSmiEvents; i++) + { + double value = 0; + RocmonSmiEvent* event = &device->activeSmiEvents[i]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + } + } + return 0; +} + +int +rocmon_smi_startCounters(RocmonContext* context) +{ + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + if (context->activeGroup < 0) + { + return -EFAULT; + } + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + fprintf(stderr, "Device %d with %d SMI events\n", device->deviceId, device->numActiveSmiEvents); + // Check if there are any counters to start + if (device->numActiveSmiEvents <= 0) + { + return 0; + } + + // Save baseline values + RocmonEventResultList* groupResult = 
&device->groupResults[rocmon_context->activeGroup]; + for (int j = 0; j < device->numActiveSmiEvents; j++) + { + double value = 0; + RocmonSmiEvent* event = &device->activeSmiEvents[j]; + RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+j]; + + // Measure counter + if (event->measureFunc) + { + event->measureFunc(device->deviceId, event, result); + } + + // Save value + result->fullValue = 0; + } + } + return 0; +} + +int +rocmon_smi_stopCounters(RocmonContext* context) +{ + int ret; + + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + return 0; +} + + +static int +rocmon_smi_getEventsOfGpu(RocmonContext* context, int gpuIdx, EventList_rocm_t* list) +{ + EventList_rocm_t tmpList = NULL; + Event_rocm_t* tmpEventList = NULL; + // Ensure rocmon is initialized + if (!rocmon_smi_initialized) + { + return -EFAULT; + } + // Validate args + if ((gpuIdx < 0) || (gpuIdx > rocmon_context->numDevices) || (!list)) + { + return -EINVAL; + } + + RocmonDevice* device = &rocmon_context->devices[gpuIdx]; + + if (*list) + { + tmpList = *list; + } + else + { + // Allocate list structure + EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); + if (tmpList == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate event list); + return -ENOMEM; + } + memset(tmpList, 0, sizeof(EventList_rocm)); + } + + // Get number of events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Add %d ROCm SMI events, get_map_size(device->smiMetrics)); + if (get_map_size(device->smiMetrics) == 0) + { + // No events -> return list + *list = tmpList; + return 0; + } + // (Re-)Allocate event array + tmpEventList = realloc(tmpList->events, (tmpList->numEvents + get_map_size(device->smiMetrics)) * sizeof(Event_rocm_t)); + if (!tmpEventList) + { + if (!*list) free(tmpList); + ERROR_PLAIN_PRINT(Cannot allocate events for event list); + return -ENOMEM; + } + tmpList->events = tmpEventList; + int startindex = tmpList->numEvents; + + // Copy 
ROCm SMI metric information + for (int i = 0; i < get_map_size(device->smiMetrics); i++) + { + RocmonSmiEvent* event = NULL; + Event_rocm_t* out = &tmpList->events[startindex + i]; + int len; + + // Get event + if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) + { + continue; + } + + // Copy name + len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; + out->name = (char*) malloc(len); + if (out->name) + { + snprintf(out->name, len, "RSMI_%s", event->name); + } + + // Copy description + char* description = "SMI Event"; // TODO: use real descriptions + len = strlen(description) + 1 /* NULL byte */; + out->description = (char*) malloc(len); + if (out->description) + { + snprintf(out->description, len, "%s", description); + } + + // Copy instances + out->instances = event->instances; + tmpList->numEvents++; + } + + *list = tmpList; + return 0; +} + + +int rocmon_smi_init(RocmonContext* context, int numGpus, const int* gpuIds) +{ + int ret = 0; + if ((!context) || (numGpus <= 0) || (!gpuIds)) + { + return -EINVAL; + } + + ret = _rocmon_smi_link_libraries(); + if (ret < 0) + { + return -EFAULT; + } + + // init rocm smi library + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); + RSMI_CALL(rsmi_init, (0), + { + ERROR_PLAIN_PRINT(Failed to init rocm_smi); + goto rocmon_init_rsmi_failed; + }); + + // Get available SMI events for devices + _rocmon_smi_init_events(context); + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice *device = &context->devices[i]; + // Initialize SMI events map + if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) + { + ERROR_PLAIN_PRINT(Cannot init smiMetrics map); + goto rocmon_init_rsmi_failed; + } + if (_rocmon_smi_get_functions(device) < 0) + { + ERROR_PRINT(Failed to get SMI functions for device %d, device->deviceId); + goto rocmon_init_rsmi_failed; + } + device->activeSmiEvents = NULL; + device->smiMetrics = NULL; + } + rocmon_smi_initialized = TRUE; + return 0; 
+rocmon_init_rsmi_failed: + RSMI_CALL(rsmi_shut_down, (), { + // fall through + }); + return 0; +} + + +void rocmon_smi_finalize(RocmonContext* context) +{ + if (!rocmon_smi_initialized) + { + return; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON SMI); + if (context) + { + if (context->devices) + { + // Free each devices fields + for (int i = 0; i < context->numDevices; i++) + { + RocmonDevice* device = &context->devices[i]; + if (device->activeSmiEvents) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing active SMI events for device %d, device->deviceId); + free(device->activeSmiEvents); + device->activeSmiEvents = NULL; + device->numActiveSmiEvents = 0; + } + if (device->smiMetrics) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing SMI event list for device %d, device->deviceId); + destroy_smap(device->smiMetrics); + device->smiMetrics = NULL; + } + } + } + if (context->smiEvents) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Freeing SMI event list); + destroy_smap(context->smiEvents); + context->smiEvents = NULL; + } + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown RSMI); + RSMI_CALL(rsmi_shut_down, (), { + ERROR_PRINT(Shutdown SMI failed); + // fall through + }); + rocmon_smi_initialized = FALSE; +} + +int +rocmon_smi_switchActiveGroup(RocmonContext* context, int newGroupId) +{ + int ret; + + ret = rocmon_smi_stopCounters(context); + if (ret < 0) + { + return ret; + } + + ret = rocmon_smi_setupCounters(context, newGroupId); + if (ret < 0) + { + return ret; + } + + ret = rocmon_smi_startCounters(context); + if (ret < 0) + { + return ret; + } + + return 0; +} + +#endif /* LIKWID_ROCMON_SMI_H */ diff --git a/src/includes/rocmon_smi_types.h b/src/includes/rocmon_smi_types.h new file mode 100644 index 000000000..cb6a5efae --- /dev/null +++ b/src/includes/rocmon_smi_types.h @@ -0,0 +1,81 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_smi_types.h + * + * Description: 
Header File of rocmon for smi backend. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_SMI_TYPES_H +#define LIKWID_ROCMON_SMI_TYPES_H + +#include +#if AMDSMI_LIB_VERSION_YEAR == 23 && AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 +typedef struct metrics_table_header_t metrics_table_header_t; +#endif +#include +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include + +struct RocmonSmiEvent_struct; +typedef int (*RocmonSmiMeasureFunc)(int deviceId, struct RocmonSmiEvent_struct* event, RocmonEventResult* result); + +typedef enum { + ROCMON_SMI_EVENT_TYPE_NORMAL = 0, + ROCMON_SMI_EVENT_TYPE_VARIANT, + ROCMON_SMI_EVENT_TYPE_SUBVARIANT, + ROCMON_SMI_EVENT_TYPE_INSTANCES +} RocmonSmiEventType; + +#define MAX_ROCMON_SMI_EVENT_NAME 40 +typedef struct RocmonSmiEvent_struct { + 
char name[MAX_ROCMON_SMI_EVENT_NAME]; + uint64_t variant; + uint64_t subvariant; + uint64_t extra; + int instances; + RocmonSmiEventType type; + RocmonSmiMeasureFunc measureFunc; +} RocmonSmiEvent; + +typedef struct { + RocmonSmiEvent* entries; + int numEntries; +} RocmonSmiEventList; + +#endif /* LIKWID_ROCMON_SMI_TYPES_H */ diff --git a/src/includes/rocmon_types.h b/src/includes/rocmon_types.h deleted file mode 100644 index 7af2e1518..000000000 --- a/src/includes/rocmon_types.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * ======================================================================================= - * - * Filename: nvmon_types.h - * - * Description: Header File of nvmon module. - * Configures and reads out performance counters - * on NVIDIA GPUs. Supports multi GPUs. - * - * Version: - * Released: - * - * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com - * Project: likwid - * - * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg - * - * This program is free software: you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation, either version 3 of the License, or (at your option) any later - * version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A - * PARTICULAR PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . 
- * - * ======================================================================================= - */ -#ifndef LIKWID_ROCMON_TYPES_H -#define LIKWID_ROCMON_TYPES_H - -#include -// #include -#ifndef ROCPROFILER_VERSION_MAJOR -#include -#endif -#include - -typedef struct { - double lastValue; - double fullValue; -} RocmonEventResult; - -typedef struct { - RocmonEventResult* results; // First rocprofiler results, then SMI results - int numResults; -} RocmonEventResultList; - - - -struct RocmonSmiEvent_struct; -typedef int (*RocmonSmiMeasureFunc)(int deviceId, struct RocmonSmiEvent_struct* event, RocmonEventResult* result); - -typedef enum { - ROCMON_SMI_EVENT_TYPE_NORMAL = 0, - ROCMON_SMI_EVENT_TYPE_VARIANT, - ROCMON_SMI_EVENT_TYPE_SUBVARIANT, - ROCMON_SMI_EVENT_TYPE_INSTANCES -} RocmonSmiEventType; - -typedef struct RocmonSmiEvent_struct { - char name[40]; - uint64_t variant; - uint64_t subvariant; - uint64_t extra; - int instances; - RocmonSmiEventType type; - RocmonSmiMeasureFunc measureFunc; -} RocmonSmiEvent; - -typedef struct { - RocmonSmiEvent* entries; - int numEntries; -} RocmonSmiEventList; - -typedef struct { - int deviceId; // LIKWID device id - - hsa_agent_t hsa_agent; // HSA agent handle for this device - rocprofiler_t* context; // Rocprofiler context (has activeEvents configured) - - // Available rocprofiler metrics - rocprofiler_info_data_t* rocMetrics; - int numRocMetrics; - - // Available ROCm SMI events - Map_t smiMetrics; - - // Currently configured rocprofiler events (bound to context) - rocprofiler_feature_t* activeRocEvents; - int numActiveRocEvents; - - // Currently configured ROCm SMI events - RocmonSmiEvent* activeSmiEvents; - int numActiveSmiEvents; - - // Results for all events in all event sets - RocmonEventResultList* groupResults; - int numGroupResults; - - // Timestamps in ns - struct { - uint64_t start; - uint64_t read; - uint64_t stop; - } time; -} RocmonDevice; - -typedef struct { - // Event Groups - GroupInfo *groups; - int 
numGroups; // Number of allocated groups - int numActiveGroups; // Number of used groups - int activeGroup; // Currently active group - - // Devices (HSA agents) - RocmonDevice *devices; - int numDevices; - - // System information - long double hsa_timestamp_factor; // hsa_timestamp * hsa_timestamp_factor = timestamp_in_ns - - // ROCm SMI events - Map_t smiEvents; -} RocmonContext; - -extern RocmonContext *rocmon_context; - - -typedef struct { - bstring tag; - int groupID; - int gpuCount; - int eventCount; - double* time; - uint32_t* count; - int* gpulist; - double** counters; -} LikwidRocmResults; -#endif /* LIKWID_ROCMON_TYPES_H */ diff --git a/src/includes/rocmon_v1.h b/src/includes/rocmon_v1.h new file mode 100644 index 000000000..3fe05b0c7 --- /dev/null +++ b/src/includes/rocmon_v1.h @@ -0,0 +1,983 @@ +/* + * ======================================================================================= + * + * Filename: rocmon_v1.h + * + * Description: Header File of rocmon module for ROCm < 6.2. + * + * Version: + * Released: + * + * Author: Thomas Gruber (tg), thomas.gruber@googlemail.com + * Project: likwid + * + * Copyright (C) 2019 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_V1_H +#define LIKWID_ROCMON_V1_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + + + +// #include +// #include +// #include + +// Variables +static void *rocmon_v1_dl_hsa_lib = NULL; +static void *rocmon_v1_dl_profiler_lib = NULL; + + +static bool rocmon_v1_initialized = FALSE; + +// Macros +#ifndef FREE_IF_NOT_NULL +#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } +#endif + +#ifndef ROCM_CALL +#define ROCM_CALL( call, args, handleerror ) \ + do { \ + hsa_status_t _status = (*call##_ptr)args; \ + if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ + fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ + const char* err = NULL; \ + rocprofiler_error_string(&err); \ + if (err) fprintf(stderr, "Error: %s\n", err); \ + handleerror; \ + } \ + } while (0) +#endif + + +// ROCm function declarations +#ifndef DECLAREFUNC_HSA +#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; +#endif + +DECLAREFUNC_HSA(hsa_init, ()); +DECLAREFUNC_HSA(hsa_shut_down, ()); +DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); +DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t attribute, void* value)); +DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); + +DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); +DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); +DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t 
feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); +DECLAREFUNC_HSA(rocprofiler_error_string, ()); +DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); +DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); + + + +// ---------------------------------------------------- +// Rocmon helper functions +// ---------------------------------------------------- + +static int +_rocmon_v1_link_libraries() +{ + #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm V1 libraries); + + // Need to link in the ROCm HSA libraries + rocmon_v1_dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_hsa_lib) + { + ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); + return -1; + } + + // Need to link in the Rocprofiler libraries + rocmon_v1_dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_profiler_lib) + { + rocmon_v1_dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); + if (!rocmon_v1_dl_profiler_lib) + { + ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); + return -1; + } + } + + // Link HSA functions + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_init); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_shut_down); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_iterate_agents); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_agent_get_info); + DLSYM_AND_CHECK(rocmon_v1_dl_hsa_lib, hsa_system_get_info); + + // Link Rocprofiler functions + 
DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_iterate_info); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_close); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_open); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_error_string); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_start); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_stop); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_read); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_get_data); + DLSYM_AND_CHECK(rocmon_v1_dl_profiler_lib, rocprofiler_get_metrics); + + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm V1 libraries done); + return 0; +} + +typedef struct { + RocmonContext* context; + int numGpus; + const int* gpuIds; +} iterate_agents_cb_arg; + +typedef struct { + RocmonDevice* device; + int currIndex; +} iterate_info_cb_arg; + + +static hsa_status_t +_rocmon_v1_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) +{ + RocmonDevice* device = (RocmonDevice*) data; + if (device) { + device->numRocMetrics++; + } + return HSA_STATUS_SUCCESS; +} + +static void +_rocmon_v1_print_rocprofiler_info_data(const rocprofiler_info_data_t info) +{ + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + return; + } + printf("Name '%s':\n", info.metric.name); + printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? 
"Metric" : "Trace")); + printf("\tInstances: %d\n", info.metric.instances); + printf("\tDescription: '%s'\n", info.metric.description); + printf("\tExpression: '%s'\n", info.metric.expr); + printf("\tBlockName: '%s'\n", info.metric.block_name); + printf("\tBlockCounters: %d\n", info.metric.block_counters); +} + +static hsa_status_t +_rocmon_v1_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) +{ + iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; + + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); +/* if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP)*/ +/* {*/ +/* _rocmon_v1_print_rocprofiler_info_data(info);*/ +/* }*/ + // Check info kind + if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + { + ERROR_PRINT(Wrong info kind %u, info.kind); + return HSA_STATUS_ERROR; + } + + // Check index + if (arg->currIndex >= arg->device->numRocMetrics) + { + ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); + return HSA_STATUS_ERROR; + } + + // Copy info data + rocprofiler_info_data_t* target_info = &arg->device->v1_rocMetrics[arg->currIndex]; + memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); + arg->currIndex++; + + return HSA_STATUS_SUCCESS; +} + + +static hsa_status_t +_rocmon_v1_iterate_agents_callback(hsa_agent_t agent, void* argv) +{ + // Count number of callback invocations as the devices id + static int nextDeviceId = 0; + int deviceId = nextDeviceId; + bool noAgent = false; + + iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; + + // Check if device is a GPU + hsa_device_type_t type; + ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); + if (type != HSA_DEVICE_TYPE_GPU) + { + return HSA_STATUS_SUCCESS; + } + nextDeviceId++; + + // Check if device is includes in arg->gpuIds + int gpuIndex = -1; + for (int i = 0; i < arg->numGpus; i++) + { + if (deviceId == arg->gpuIds[i]) + { + gpuIndex = i; + break; + } + } + if (gpuIndex < 0) + { + return 
HSA_STATUS_SUCCESS; + } + //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); + + // Add agent to context + RocmonDevice *device = &arg->context->devices[gpuIndex]; + device->deviceId = deviceId; + device->hsa_agent = agent; + device->v1_context = NULL; + device->numActiveRocEvents = 0; + device->v1_activeRocEvents = NULL; + device->numGroupResults = 0; + device->groupResults = NULL; + + // Get number of available metrics + device->numRocMetrics = 0; + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); + + // workaround for bug in ROCm 5.4.0 + if(device->numRocMetrics == 0) { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_count, device), return HSA_STATUS_ERROR); + noAgent = true; + } + + // Allocate memory for metrics + device->v1_rocMetrics = (rocprofiler_info_data_t*) malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); + if (device->v1_rocMetrics == NULL) + { + ERROR_PLAIN_PRINT(Cannot allocate set of v1_rocMetrics); + return HSA_STATUS_ERROR; + } + + // Fetch metric informatino + iterate_info_cb_arg info_arg = { + .device = device, + .currIndex = 0, + }; + //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); + + // If the call fails with agent, call rocprofiler_iterate_info without agent + if(noAgent) + { + ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); + } else { + ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_v1_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); + } + + return HSA_STATUS_SUCCESS; +} + + + + + +static int +_rocmon_v1_get_timestamp(uint64_t* 
timestamp_ns)
+{
+    // Reads the HSA system timestamp and scales it to nanoseconds with the
+    // factor stored in the global rocmon_context. Returns 0 or -1.
+    uint64_t timestamp = 0;
+
+    // Get timestamp from system
+    ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, &timestamp), return -1);
+    // Convert to nanoseconds
+    *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor);
+
+    return 0;
+}
+
+
+// Converts the rocprofiler result of active event 'eventId' to a double.
+// Returns -1 for result kinds that carry no numeric value.
+static int
+_rocmon_v1_getLastResult(RocmonDevice* device, int eventId, double* value)
+{
+    rocprofiler_data_t* data = &device->v1_activeRocEvents[eventId].data;
+
+    switch (data->kind)
+    {
+        case ROCPROFILER_DATA_KIND_INT32:
+            *value = (double) data->result_int32;
+            break;
+        case ROCPROFILER_DATA_KIND_INT64:
+            *value = (double) data->result_int64;
+            break;
+        case ROCPROFILER_DATA_KIND_FLOAT:
+            *value = (double) data->result_float;
+            break;
+        case ROCPROFILER_DATA_KIND_DOUBLE:
+            *value = data->result_double;
+            break;
+
+        case ROCPROFILER_DATA_KIND_BYTES:
+        case ROCPROFILER_DATA_KIND_UNINIT:
+        default:
+            return -1;
+    }
+
+    return 0;
+}
+
+
+// Reads all active rocprofiler events of one device and updates the result
+// list of the currently active group (fullValue = latest reading,
+// lastValue = difference to the previous reading).
+static int
+_rocmon_readCounters_rocprofiler_v1(RocmonDevice* device)
+{
+    int ret;
+
+    // Check if there are any counters to start
+    if (device->numActiveRocEvents <= 0)
+    {
+        return 0;
+    }
+
+    if (!device->v1_context)
+    {
+        return 0;
+    }
+
+    ROCM_CALL(rocprofiler_read, (device->v1_context, 0), return -1);
+    ROCM_CALL(rocprofiler_get_data, (device->v1_context, 0), return -1);
+    ROCM_CALL(rocprofiler_get_metrics, (device->v1_context), return -1);
+
+    // Update results
+    RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup];
+    for (int i = 0; i < device->numActiveRocEvents; i++)
+    {
+        RocmonEventResult* result = &groupResult->results[i];
+        double current = 0.0;
+
+        // Read value into a temporary so the previous full value is still
+        // available for the delta below
+        ret = _rocmon_v1_getLastResult(device, i, &current);
+        if (ret < 0)
+        {
+            return -1;
+        }
+
+        // Calculate delta since last read. The reference has to be the
+        // previous full value; subtracting the previous delta (as before)
+        // is only correct for the first two reads after a start.
+        result->lastValue = current - result->fullValue;
+        result->fullValue = current;
+    }
+
+    return 0;
+}
+
+
+
+// Takes one timestamp, stores it per device through getDestTimestampFunc
+// (may be NULL) and reads the counters of every device flagged rocprof_v1.
+int
+_rocmon_v1_readCounters(RocmonContext* context, uint64_t* (*getDestTimestampFunc)(RocmonDevice* device))
+{
+    int ret;
+
+    // Get timestamp
+    uint64_t timestamp;
+    ret = _rocmon_v1_get_timestamp(&timestamp);
+    if (ret != 0)
+    {
+        return ret;
+    }
+
+    for (int i = 0; i < context->numDevices; i++)
+    {
+        RocmonDevice* device = &context->devices[i];
+        if (!device->rocprof_v1) continue;
+
+        // Save timestamp
+        if (getDestTimestampFunc)
+        {
+            uint64_t* timestampDest = getDestTimestampFunc(device);
+            if (timestampDest)
+            {
+                *timestampDest = timestamp;
+            }
+        }
+
+        // Read rocprofiler counters
+        ret = _rocmon_readCounters_rocprofiler_v1(device);
+        if (ret < 0) return ret;
+    }
+
+    return 0;
+}
+
+
+// Timestamp destination used by rocmon_v1_readCounters
+static uint64_t*
+_rocmon_v1_get_read_time(RocmonDevice* device)
+{
+    return &device->time.read;
+}
+
+
+// Timestamp destination used by rocmon_v1_stopCounters
+static uint64_t*
+_rocmon_v1_get_stop_time(RocmonDevice* device)
+{
+    return &device->time.stop;
+}
+
+
+// Initializes the rocprofiler v1 backend: links the libraries, initializes
+// HSA, allocates context->devices if the caller did not, and collects the
+// metric information of the GPUs listed in gpuIds.
+// Returns 0 on success (or if already initialized), a negative value otherwise.
+int
+rocmon_v1_init(RocmonContext* context, int numGpus, const int* gpuIds)
+{
+    // check if already initialized
+    if (rocmon_v1_initialized)
+    {
+        return 0;
+    }
+    if (context == NULL)
+    {
+        return -EEXIST;
+    }
+
+    // Validate arguments
+    if (numGpus <= 0)
+    {
+        ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus);
+        return -EINVAL;
+    }
+    if (gpuIds == NULL)
+    {
+        // The agent callback indexes gpuIds[0..numGpus-1]
+        ERROR_PLAIN_PRINT(List of GPU IDs must not be NULL);
+        return -EINVAL;
+    }
+
+    // Initialize other parts
+    init_configuration();
+
+    // initialize libraries
+    int ret = _rocmon_v1_link_libraries();
+    if (ret < 0)
+    {
+        ERROR_PLAIN_PRINT(Failed to initialize libraries);
+        return ret;
+    }
+
+    // init hsa library
+    //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA);
+    ROCM_CALL(hsa_init, (),
+    {
+        ERROR_PLAIN_PRINT(Failed to init hsa library);
+        goto rocmon_init_hsa_failed;
+    });
+
+    if (!context->devices)
+    {
+        // Zero-initialize: the agent callback only fills entries that match
+        // an ID in gpuIds, but finalize inspects rocprof_v1 and v1_context
+        // of every entry.
+        context->devices = (RocmonDevice*) calloc((size_t)numGpus, sizeof(RocmonDevice));
+        if (!context->devices)
+        {
+            ERROR_PLAIN_PRINT(Cannot allocate set of GPUs);
+            // hsa_init succeeded above, balance it before returning
+            ROCM_CALL(hsa_shut_down, (), {
+                // nothing else to clean up
+            });
+            return -ENOMEM;
+        }
+        context->numDevices = numGpus;
+    }
+    // Get hsa timestamp factor
+    uint64_t frequency_hz;
+    //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor);
+    ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz),
+    {
+        ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor);
+        goto rocmon_init_info_agents_failed;
+    });
+    context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz;
+
+    // initialize structures for specified devices (fetch ROCm specific info)
+    iterate_agents_cb_arg arg = {
+        .context = context,
+        .numGpus = numGpus,
+        .gpuIds = gpuIds,
+    };
+    //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus);
+    ROCM_CALL(hsa_iterate_agents, (_rocmon_v1_iterate_agents_callback, &arg),
+    {
+        ERROR_PRINT(Error while iterating through available agents);
+        goto rocmon_init_info_agents_failed;
+    });
+
+    rocmon_v1_initialized = TRUE;
+    return 0;
+rocmon_init_info_agents_failed:
+    ROCM_CALL(hsa_shut_down, (), {
+        // fall through
+    });
+rocmon_init_hsa_failed:
+    free(context->devices);
+    context->devices = NULL;
+    context->numDevices = 0;
+    return -1;
+}
+
+
+// Releases the per-device rocprofiler v1 resources, shuts HSA down and marks
+// the backend as uninitialized. The devices array itself is not freed here.
+void
+rocmon_v1_finalize(RocmonContext* context)
+{
+
+    if (!rocmon_v1_initialized)
+    {
+        return;
+    }
+    //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON);
+
+    if (context)
+    {
+        if (context->devices)
+        {
+            // Free each devices fields
+            for (int i = 0; i < context->numDevices; i++)
+            {
+                RocmonDevice* device = &context->devices[i];
+                if (device->rocprof_v1)
+                {
+                    FREE_IF_NOT_NULL(device->v1_rocMetrics);
+                    FREE_IF_NOT_NULL(device->v1_activeRocEvents);
+                }
+                if (device->v1_context)
+                {
+                    ROCM_CALL(rocprofiler_close, (device->v1_context),);
+                    // Do not keep a handle to the closed context around
+                    device->v1_context = NULL;
+                }
+            }
+        }
+    }
+
+    ROCM_CALL(hsa_shut_down, (), {
+        ERROR_PRINT(Shutdown HSA failed);
+        // fall through
+    });
+
+    // Allow a later rocmon_v1_init and turn a second finalize into a no-op
+    rocmon_v1_initialized = FALSE;
+}
+
+
+/*int*/
+/*rocmon_v1_addEventSet(const char* eventString, int* gid)*/
+/*{*/
+/*    // Check arguments*/
+/*    if (!eventString)*/
+/*    {*/
+/*        return -EINVAL;*/
+/*    }*/
+/*    */
+/*    // Ensure rocmon is initialized*/
+/*    if (!rocmon_v1_initialized)*/
+/*    {*/
+/*        return -EFAULT;*/
+/*    }*/
+
+/*    // Allocate memory
for event group if necessary*/
+/*    if (rocmon_context->numActiveGroups == rocmon_context->numGroups)*/
+/*    {*/
+/*        GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo));*/
+/*        if (tmpInfo == NULL)*/
+/*        {*/
+/*            ERROR_PLAIN_PRINT(Cannot allocate additional group);*/
+/*            return -ENOMEM;*/
+/*        }*/
+/*        rocmon_context->groups = tmpInfo;*/
+/*        rocmon_context->numGroups++;*/
+/*    }*/
+
+/*    // Parse event string*/
+/*    int err = _rocmon_v1_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]);*/
+/*    if (err < 0)*/
+/*    {*/
+/*        return err;*/
+/*    }*/
+
+/*    */
+
+/*    *gid = rocmon_context->numActiveGroups;*/
+/*    rocmon_context->numActiveGroups++;*/
+/*    return 0;*/
+/*}*/
+
+
+// Replaces the rocprofiler context of 'device' with one that monitors the
+// given metric names. With numEvents <= 0 the device is left without an
+// active context and without active events.
+// The name strings are referenced, not copied; they must outlive the context.
+int
+_rocmon_setupCounters_rocprofiler_v1(RocmonDevice* device, const char** events, int numEvents)
+{
+    // Close previous rocprofiler context
+    if (device->v1_context)
+    {
+        //ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context);
+        ROCM_CALL(rocprofiler_close, (device->v1_context), return -1);
+        // The handle is invalid after close; clear it so that later
+        // start/read/stop/close calls skip it instead of reusing it
+        device->v1_context = NULL;
+    }
+
+    // The previous feature array belonged to the context closed above
+    FREE_IF_NOT_NULL(device->v1_activeRocEvents);
+    device->numActiveRocEvents = 0;
+
+    // Look if the are any events
+    if (numEvents <= 0)
+    {
+        return 0;
+    }
+    if (events == NULL)
+    {
+        return -EINVAL;
+    }
+
+    // Create feature array to monitor (zeroed, only kind and name are set here)
+    rocprofiler_feature_t* features = (rocprofiler_feature_t*) calloc((size_t)numEvents, sizeof(rocprofiler_feature_t));
+    if (features == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate feature list);
+        return -ENOMEM;
+    }
+    for (int i = 0; i < numEvents; i++)
+    {
+        features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC;
+        features[i].name = events[i];
+        //ROCMON_DEBUG_PRINT(DEBUGLEV_DEBUG, Setup ROCMON rocprofiler_v1 counter %d %s, i, events[i]);
+    }
+
+    device->numActiveRocEvents = numEvents;
+    device->v1_activeRocEvents = features;
+
+    // Open context
+    rocprofiler_properties_t properties = {};
+    properties.queue_depth = 128;
+    uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP;
+
+    // Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time.
+    ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->v1_activeRocEvents, device->numActiveRocEvents, &device->v1_context, mode, &properties), return -1);
+
+    return 0;
+}
+
+
+// Configures every device of 'context' for the events of group 'gid'.
+// Only events with the 'ROCP_' prefix are handed to rocprofiler.
+int
+rocmon_v1_setupCounters(RocmonContext* context, int gid)
+{
+    int ret;
+
+    // Check arguments
+    if (context == NULL)
+    {
+        return -EINVAL;
+    }
+    if (gid < 0 || gid >= context->numActiveGroups)
+    {
+        return -EINVAL;
+    }
+
+    // Ensure rocmon is initialized
+    if (!rocmon_v1_initialized)
+    {
+        return -EFAULT;
+    }
+
+    // Get group info
+    GroupInfo* group = &context->groups[gid];
+
+    //
+    // Separate rocprofiler and SMI events
+    //
+    const char **rocEvents = NULL;
+    int numRocEvents = 0;
+
+    // Allocate memory for string arrays. Skipped for an empty group because
+    // malloc(0) may return NULL, which must not be reported as ENOMEM.
+    if (group->nevents > 0)
+    {
+        rocEvents = (const char**) malloc(group->nevents * sizeof(const char*));
+        if (rocEvents == NULL)
+        {
+            ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array);
+            return -ENOMEM;
+        }
+    }
+
+    // Go through each event and sort it
+    for (int i = 0; i < group->nevents; i++)
+    {
+        const char* name = group->events[i];
+        if (strncmp(name, "ROCP_", 5) == 0)
+        {
+            // Rocprofiler event
+            rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix
+            numRocEvents++;
+        }
+    }
+
+    // Add events to each device
+    for (int i = 0; i < context->numDevices; i++)
+    {
+        RocmonDevice* device = &context->devices[i];
+
+        // Add rocprofiler events
+        //ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents);
+        ret = _rocmon_setupCounters_rocprofiler_v1(device, rocEvents, numRocEvents);
+        if (ret < 0)
+        {
+            free(rocEvents);
+            return ret;
+        }
+    }
+    // Cleanup
+    free(rocEvents);
+
+    return 0;
+}
+
+
+// Zeroes the results of the active group and starts the rocprofiler context
+// of 'device', if it has one.
+static int
+_rocmon_startCounters_rocprofiler_v1(RocmonDevice* device)
+{
+    // Check if there are any counters to start
+    if (device->numActiveRocEvents <= 0)
+    {
+        return 0;
+    }
+
+    // Reset results
+    RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup];
+    for (int i = 0; i < device->numActiveRocEvents; i++)
+    {
+        RocmonEventResult* result = &groupResult->results[i];
+        result->lastValue = 0;
+        result->fullValue = 0;
+    }
+
+    if (device->v1_context)
+    {
+        ROCM_CALL(rocprofiler_start, (device->v1_context, 0), return -1);
+    }
+
+    return 0;
+}
+
+
+
+// Records a common start/read timestamp and starts the counters on all devices.
+int
+rocmon_v1_startCounters(RocmonContext* context)
+{
+    int ret;
+
+    // Ensure rocmon is initialized
+    if (!rocmon_v1_initialized)
+    {
+        return -EFAULT;
+    }
+
+    // Get timestamp
+    uint64_t timestamp;
+    ret = _rocmon_v1_get_timestamp(&timestamp);
+    if (ret != 0)
+    {
+        return ret;
+    }
+
+    // Start counters on each device
+    for (int i = 0; i < context->numDevices; i++)
+    {
+        RocmonDevice* device = &context->devices[i];
+        device->time.start = timestamp;
+        device->time.read = timestamp;
+
+        // Start rocprofiler events
+        ret = _rocmon_startCounters_rocprofiler_v1(device);
+        if (ret < 0) return ret;
+
+        // Start SMI events
+/*        _rocmon_startCounters_smi(device);*/
+/*        if (ret < 0) return ret;*/
+    }
+
+    return 0;
+}
+
+
+// Stops the rocprofiler context of 'device', if it has one. The context
+// stays open and can be started again.
+static int
+_rocmon_stopCounters_rocprofiler_v1(RocmonDevice* device)
+{
+    if (device->v1_context)
+    {
+        // Stop context (it is closed in setup or finalize, not here)
+        ROCM_CALL(rocprofiler_stop, (device->v1_context, 0), return -1);
+    }
+
+    return 0;
+}
+
+
+// Takes a final reading (stored with the stop timestamp) and stops the
+// counters on all devices.
+int
+rocmon_v1_stopCounters(RocmonContext* context)
+{
+    int ret;
+
+    // Ensure rocmon is initialized
+    if (!rocmon_v1_initialized)
+    {
+        return -EFAULT;
+    }
+
+    // Read counters
+    ret = _rocmon_v1_readCounters(context, &_rocmon_v1_get_stop_time);
+    if (ret < 0) return ret;
+
+    for (int i = 0; i < context->numDevices; i++)
+    {
+        RocmonDevice* device = &context->devices[i];
+
+        // Stop rocprofiler events
+        ret = _rocmon_stopCounters_rocprofiler_v1(device);
+        if (ret < 0) return ret;
+
+        // Nothing to stop for SMI events
+    }
+
+    return 0;
+}
+
+
+int
+rocmon_v1_readCounters(RocmonContext* context)
+{
+    int ret;
+
+    // Ensure rocmon is initialized
+    if (!rocmon_v1_initialized)
+    {
+        return -EFAULT;
+    }
+
+    // Read
counters
+    ret = _rocmon_v1_readCounters(context, &_rocmon_v1_get_read_time);
+    if (ret < 0) return ret;
+
+    return 0;
+}
+
+
+// Appends the rocprofiler metrics of device 'gpuIdx' to '*list', each name
+// prefixed with 'ROCP_'. If '*list' is NULL a new list is allocated and
+// stored in '*list' on success.
+// Returns 0 on success, -EFAULT if the backend is not initialized, -EINVAL
+// for invalid arguments and -ENOMEM if an allocation fails.
+int
+rocmon_v1_getEventsOfGpu(RocmonContext* context, int gpuIdx, EventList_rocm_t* list)
+{
+    EventList_rocm_t tmpList = NULL;
+    Event_rocm_t* tmpEventList = NULL;
+    // Ensure rocmon is initialized
+    if (!rocmon_v1_initialized)
+    {
+        return -EFAULT;
+    }
+    // Validate args (valid indices are 0 .. numDevices-1)
+    if ((!context) || (!list) || (gpuIdx < 0) || (gpuIdx >= context->numDevices))
+    {
+        return -EINVAL;
+    }
+
+    RocmonDevice* device = &context->devices[gpuIdx];
+
+    if (*list)
+    {
+        tmpList = *list;
+    }
+    else
+    {
+        // Allocate list structure. Assign to the function-level tmpList; a
+        // second declaration here would shadow it and leave it NULL below.
+        tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm));
+        if (tmpList == NULL)
+        {
+            ERROR_PLAIN_PRINT(Cannot allocate event list);
+            return -ENOMEM;
+        }
+        memset(tmpList, 0, sizeof(EventList_rocm));
+    }
+
+    if (device->numRocMetrics == 0)
+    {
+        // No events -> return list
+        *list = tmpList;
+        return 0;
+    }
+    // (Re-)Allocate event array
+    tmpEventList = realloc(tmpList->events, (tmpList->numEvents + device->numRocMetrics) * sizeof(Event_rocm_t));
+    if (!tmpEventList)
+    {
+        // Only release the list if it was allocated by this call
+        if (!*list) free(tmpList);
+        ERROR_PLAIN_PRINT(Cannot allocate events for event list);
+        return -ENOMEM;
+    }
+    tmpList->events = tmpEventList;
+    int startindex = tmpList->numEvents;
+
+    // Copy rocprofiler event information
+    for (int i = 0; i < device->numRocMetrics; i++)
+    {
+        rocprofiler_info_data_t* event = &device->v1_rocMetrics[i];
+        Event_rocm_t* out = &tmpList->events[startindex + i];
+        int len;
+
+        // Copy name
+        len = strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */;
+        out->name = (char*) malloc(len);
+        if (out->name)
+        {
+            snprintf(out->name, len, "ROCP_%s", event->metric.name);
+        }
+
+        // Copy description
+        len = strlen(event->metric.description) + 1 /* NULL byte */;
+        out->description = (char*) malloc(len);
+        if (out->description)
+        {
+            snprintf(out->description, len, "%s", event->metric.description);
+        }
+        tmpList->numEvents++;
+    }
+    *list = tmpList;
+    return 0;
+}
+
+
+// Stops the running counters, sets up group 'newGroupId' and starts again.
+// Returns the first negative error code of the three steps, 0 otherwise.
+int
+rocmon_v1_switchActiveGroup(RocmonContext* context, int newGroupId)
+{
+    int ret;
+
+    ret = rocmon_v1_stopCounters(context);
+    if (ret < 0)
+    {
+        return ret;
+    }
+
+    ret = rocmon_v1_setupCounters(context, newGroupId);
+    if (ret < 0)
+    {
+        return ret;
+    }
+
+    ret = rocmon_v1_startCounters(context);
+    if (ret < 0)
+    {
+        return ret;
+    }
+
+    return 0;
+}
+
+
+
+#endif /* LIKWID_ROCMON_V1_H */
+
diff --git a/src/includes/rocmon_v1_types.h b/src/includes/rocmon_v1_types.h
new file mode 100644
index 000000000..22d588a90
--- /dev/null
+++ b/src/includes/rocmon_v1_types.h
@@ -0,0 +1,59 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  rocmon_v1_types.h
+ *
+ *      Description:  Header File of rocmon v1 module.
+ *
+ *      Version:
+ *      Released:
+ *
+ *      Author:   Thomas Gruber (tg), thomas.gruber@googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ * + * ======================================================================================= + */ +#ifndef LIKWID_ROCMON_V1_TYPES_H +#define LIKWID_ROCMON_V1_TYPES_H + +#include +// #include +#ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#undef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#endif +#ifdef ROCPROFILER_EXPORT +#undef ROCPROFILER_EXPORT +#endif +#ifdef ROCPROFILER_IMPORT +#undef ROCPROFILER_IMPORT +#endif +#ifdef ROCPROFILER_VERSION_MAJOR +#undef ROCPROFILER_VERSION_MAJOR +#endif +#ifdef ROCPROFILER_VERSION_MINOR +#undef ROCPROFILER_VERSION_MINOR +#endif +#ifdef ROCPROFILER_API +#undef ROCPROFILER_API +#endif +#include + + +#include + + +#endif /* LIKWID_ROCMON_V1_TYPES_H */ diff --git a/src/includes/types.h b/src/includes/types.h index 1c45306e8..1461fcb14 100644 --- a/src/includes/types.h +++ b/src/includes/types.h @@ -87,4 +87,11 @@ typedef struct { #define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0])) + +/*#if __STDC_VERSION__ <= 199901L +typedef int bool; +#else +#include +#endif*/ + #endif /*TYPES_H*/ diff --git a/src/perfmon.c b/src/perfmon.c index 2322b50a5..129fcd71b 100644 --- a/src/perfmon.c +++ b/src/perfmon.c @@ -771,7 +771,7 @@ perfmon_check_counter_map(int cpu_id) HPMinit(); if (HPMaddThread(cpu_id) != 0) { - ERROR_PLAIN_PRINT(Cannot check counters without access to performance counters) + ERROR_PLAIN_PRINT(Cannot check counters without access to performance counters); return; } own_hpm = 1; diff --git a/src/power.c b/src/power.c index b223925fa..c8e37eda6 100644 --- a/src/power.c +++ b/src/power.c @@ -249,7 +249,7 @@ power_init(int cpuId) err = HPMaddThread(cpuId); if (err != 0) { - ERROR_PRINT(Cannot get access to RAPL counters) + ERROR_PRINT(Cannot get access to RAPL counters); return err; } } diff --git a/src/rocmon.c b/src/rocmon.c index ba7bdf85b..f767b29dc 100644 --- a/src/rocmon.c +++ b/src/rocmon.c @@ -44,1133 +44,122 @@ #include #include -#include -#include -#include -#if AMDSMI_LIB_VERSION_YEAR == 23 && 
AMDSMI_LIB_VERSION_MAJOR == 4 && AMDSMI_LIB_VERSION_MINOR == 0 && AMDSMI_LIB_VERSION_RELEASE == 0 -typedef struct metrics_table_header_t metrics_table_header_t; +#include +#ifdef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE +#undef HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE #endif -#include - -// #include -// #include -// #include - -// Variables -static void *dl_hsa_lib = NULL; -static void *dl_profiler_lib = NULL; -static void *dl_rsmi_lib = NULL; -RocmonContext *rocmon_context = NULL; -static bool rocmon_initialized = FALSE; +#include int likwid_rocmon_verbosity = DEBUGLEV_ONLY_ERROR; +static int rocmon_initialized = 0; +static RocmonContext* rocmon_context = NULL; + +// Include backends +#include +#include +#ifdef LIKWID_ROCPROF_SDK +#include +#include +#endif +#include +#include -// Macros -#define membersize(type, member) sizeof(((type *) NULL)->member) -#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } -#define ROCM_CALL( call, args, handleerror ) \ - do { \ - hsa_status_t _status = (*call##_ptr)args; \ - if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ - const char* err = NULL; \ - fprintf(stderr, "Error: function %s failed with error %d\n", #call, _status); \ - rocprofiler_error_string(&err); \ - fprintf(stderr, "Error: %s\n", err); \ - handleerror; \ - } \ - } while (0) - -#define RSMI_CALL( call, args, handleerror ) \ - do { \ - rsmi_status_t _status = (*call##_ptr)args; \ - if (_status != RSMI_STATUS_SUCCESS) { \ - fprintf(stderr, "Error: function %s failed with error %d.\n", #call, _status); \ - handleerror; \ - } \ - } while (0) - -// ROCm function declarations -#define ROCMWEAK __attribute__(( weak )) -#define DECLAREFUNC_HSA(funcname, funcsig) hsa_status_t ROCMWEAK funcname funcsig; hsa_status_t ( *funcname##_ptr ) funcsig; -#define DECLAREFUNC_SMI(funcname, funcsig) rsmi_status_t ROCMWEAK funcname funcsig; rsmi_status_t ( *funcname##_ptr ) funcsig; - -DECLAREFUNC_HSA(hsa_init, ()); 
-DECLAREFUNC_HSA(hsa_shut_down, ()); -DECLAREFUNC_HSA(hsa_iterate_agents, (hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data)); -DECLAREFUNC_HSA(hsa_agent_get_info, (hsa_agent_t agent, hsa_agent_info_t attribute, void* value)); -DECLAREFUNC_HSA(hsa_system_get_info, (hsa_system_info_t attribute, void *value)); - -DECLAREFUNC_HSA(rocprofiler_iterate_info, (const hsa_agent_t* agent, rocprofiler_info_kind_t kind, hsa_status_t (*callback)(const rocprofiler_info_data_t, void* data), void* data)); -DECLAREFUNC_HSA(rocprofiler_close, (rocprofiler_t* context)); -DECLAREFUNC_HSA(rocprofiler_open, (hsa_agent_t agent, rocprofiler_feature_t* features, uint32_t feature_count, rocprofiler_t** context, uint32_t mode, rocprofiler_properties_t* properties)); -DECLAREFUNC_HSA(rocprofiler_error_string, ()); -DECLAREFUNC_HSA(rocprofiler_start, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_stop, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_read, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_data, (rocprofiler_t* context, uint32_t group_index)); -DECLAREFUNC_HSA(rocprofiler_get_metrics, (const rocprofiler_t* context)); - -DECLAREFUNC_SMI(rsmi_init, (uint64_t flags)); -DECLAREFUNC_SMI(rsmi_shut_down, ()); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_open, (uint32_t dv_ind, rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_variant_iterator_open, (rsmi_func_id_iter_handle_t obj_h, rsmi_func_id_iter_handle_t* var_iter)); -DECLAREFUNC_SMI(rsmi_func_iter_value_get, (rsmi_func_id_iter_handle_t handle, rsmi_func_id_value_t* value )); -DECLAREFUNC_SMI(rsmi_func_iter_next, (rsmi_func_id_iter_handle_t handle)); -DECLAREFUNC_SMI(rsmi_dev_supported_func_iterator_close, (rsmi_func_id_iter_handle_t* handle)); -DECLAREFUNC_SMI(rsmi_dev_power_ave_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* power)); -DECLAREFUNC_SMI(rsmi_dev_pci_throughput_get, 
(uint32_t dv_ind, uint64_t* sent, uint64_t* received, uint64_t* max_pkt_sz)); -DECLAREFUNC_SMI(rsmi_dev_pci_replay_counter_get, (uint32_t dv_ind, uint64_t* counter)); -DECLAREFUNC_SMI(rsmi_dev_memory_total_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* total)); -DECLAREFUNC_SMI(rsmi_dev_memory_usage_get, (uint32_t dv_ind, rsmi_memory_type_t mem_type, uint64_t* used )); -DECLAREFUNC_SMI(rsmi_dev_memory_busy_percent_get, (uint32_t dv_ind, uint32_t* busy_percent)); -DECLAREFUNC_SMI(rsmi_dev_memory_reserved_pages_get, (uint32_t dv_ind, uint32_t* num_pages, rsmi_retired_page_record_t* records)); -DECLAREFUNC_SMI(rsmi_dev_fan_rpms_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_get, (uint32_t dv_ind, uint32_t sensor_ind, int64_t* speed)); -DECLAREFUNC_SMI(rsmi_dev_fan_speed_max_get, (uint32_t dv_ind, uint32_t sensor_ind, uint64_t* max_speed)); -DECLAREFUNC_SMI(rsmi_dev_temp_metric_get, (uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t* temperature)); -DECLAREFUNC_SMI(rsmi_dev_volt_metric_get, (uint32_t dv_ind, rsmi_voltage_type_t sensor_type, rsmi_voltage_metric_t metric, int64_t* voltage)); -DECLAREFUNC_SMI(rsmi_dev_overdrive_level_get, (uint32_t dv_ind, uint32_t* od)); -DECLAREFUNC_SMI(rsmi_dev_ecc_count_get, (uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t* ec)); -DECLAREFUNC_SMI(rsmi_compute_process_info_get, (rsmi_process_info_t* procs, uint32_t* num_items)); - - -// ---------------------------------------------------- -// SMI event wrapper -// ---------------------------------------------------- - -static int -_smi_wrapper_pci_throughput_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t value; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _smi_wrapper_pci_throughput_get(%d, %d), deviceId, event->extra); - // Internal variant: 0 for sent, 1 for received bytes and 2 for max packet size - if (event->extra == 0) 
RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, &value, NULL, NULL), return -1); - else if (event->extra == 1) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, &value, NULL), return -1); - else if (event->extra == 2) RSMI_CALL(rsmi_dev_pci_throughput_get, (deviceId, NULL, NULL, &value), return -1); - else return -1; - - result->fullValue += value; - result->lastValue = value; - - return 0; -} - - -static int -_smi_wrapper_pci_replay_counter_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t counter; - RSMI_CALL(rsmi_dev_pci_replay_counter_get, (deviceId, &counter), return -1); - result->fullValue += counter; - result->lastValue = counter; - - return 0; -} - - -static int -_smi_wrapper_power_ave_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t power; - RSMI_CALL(rsmi_dev_power_ave_get, (deviceId, event->subvariant, &power), return -1); - result->fullValue += power; - result->lastValue = power; - - return 0; -} - - -static int -_smi_wrapper_memory_total_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t total; - RSMI_CALL(rsmi_dev_memory_total_get, (deviceId, event->variant, &total), return -1); - result->fullValue += total; - result->lastValue = total; - - return 0; -} - - -static int -_smi_wrapper_memory_usage_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint64_t used; - RSMI_CALL(rsmi_dev_memory_usage_get, (deviceId, event->variant, &used), return -1); - result->fullValue += used; - result->lastValue = used; - - return 0; -} - - -static int -_smi_wrapper_memory_busy_percent_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t percent; - RSMI_CALL(rsmi_dev_memory_busy_percent_get, (deviceId, &percent), return -1); - result->fullValue += percent; - result->lastValue = percent; - - return 0; -} - - -static int -_smi_wrapper_memory_reserved_pages_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* 
result) -{ - uint32_t num_pages; - RSMI_CALL(rsmi_dev_memory_reserved_pages_get, (deviceId, &num_pages, NULL), return -1); - result->fullValue += num_pages; - result->lastValue = num_pages; - - return 0; -} - - -static int -_smi_wrapper_fan_rpms_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_rpms_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t speed; - RSMI_CALL(rsmi_dev_fan_speed_get, (deviceId, event->subvariant, &speed), return -1); - result->fullValue += speed; - result->lastValue = speed; - - return 0; -} - - -static int -_smi_wrapper_fan_speed_max_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t max_speed; - RSMI_CALL(rsmi_dev_fan_speed_max_get, (deviceId, event->subvariant, &max_speed), return -1); - result->fullValue += max_speed; - result->lastValue = max_speed; - - return 0; -} - - -static int -_smi_wrapper_temp_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t temperature; - RSMI_CALL(rsmi_dev_temp_metric_get, (deviceId, event->subvariant, event->variant, &temperature), return -1); - result->fullValue += temperature; - result->lastValue = temperature; - - return 0; -} - - -static int -_smi_wrapper_volt_metric_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - int64_t voltage; - RSMI_CALL(rsmi_dev_volt_metric_get, (deviceId, event->subvariant, event->variant, &voltage), return -1); - result->fullValue += voltage; - result->lastValue = voltage; - - return 0; -} - - -static int -_smi_wrapper_overdrive_level_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t overdrive; - RSMI_CALL(rsmi_dev_overdrive_level_get, (deviceId, &overdrive), return -1); - result->fullValue += 
overdrive; - result->lastValue = overdrive; - - return 0; -} - - -static int -_smi_wrapper_ecc_count_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - rsmi_error_count_t error_count; - RSMI_CALL(rsmi_dev_ecc_count_get, (deviceId, event->variant, &error_count), return -1); - - if (event->extra == 0) - { - result->lastValue = error_count.correctable_err - result->fullValue; - result->fullValue = error_count.correctable_err; - } - else if (event->extra == 1) - { - result->lastValue = error_count.uncorrectable_err - result->fullValue; - result->fullValue = error_count.uncorrectable_err; - } - else - { - return -1; - } - - return 0; -} - - -static int -_smi_wrapper_compute_process_info_get(int deviceId, RocmonSmiEvent* event, RocmonEventResult* result) -{ - uint32_t num_items; - RSMI_CALL(rsmi_compute_process_info_get, (NULL, &num_items), return -1); - result->fullValue += num_items; - result->lastValue = num_items; - - return 0; -} - - -// ---------------------------------------------------- -// Rocmon helper functions -// ---------------------------------------------------- - -static int -_rocmon_link_libraries() -{ - #define DLSYM_AND_CHECK( dllib, name ) name##_ptr = dlsym( dllib, #name ); if ( dlerror() != NULL ) { ERROR_PRINT(Failed to link #name); return -1; } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries); - - // Need to link in the ROCm HSA libraries - dl_hsa_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_hsa_lib) - { - ERROR_PRINT(ROCm HSA library libhsa-runtime64.so not found: %s, dlerror()); - return -1; - } - - // Need to link in the Rocprofiler libraries - dl_profiler_lib = dlopen("librocprofiler64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - dl_profiler_lib = dlopen("librocprofiler64.so.1", RTLD_NOW | RTLD_GLOBAL); - if (!dl_profiler_lib) - { - ERROR_PRINT(Rocprofiler library librocprofiler64.so not found: %s, dlerror()); - return -1; - } - } - - // Need to link in the 
Rocprofiler libraries - dl_rsmi_lib = dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL); - if (!dl_rsmi_lib) - { - ERROR_PRINT(ROCm SMI library librocm_smi64.so not found: %s, dlerror()); - return -1; - } - - // Link HSA functions - DLSYM_AND_CHECK(dl_hsa_lib, hsa_init); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_shut_down); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_iterate_agents); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_agent_get_info); - DLSYM_AND_CHECK(dl_hsa_lib, hsa_system_get_info); - - // Link Rocprofiler functions - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_iterate_info); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_close); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_open); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_error_string); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_start); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_stop); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_read); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_data); - DLSYM_AND_CHECK(dl_profiler_lib, rocprofiler_get_metrics); - - // Link SMI functions - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_init); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_shut_down); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_variant_iterator_open); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_value_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_func_iter_next); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_supported_func_iterator_close); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_power_ave_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_throughput_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_pci_replay_counter_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_total_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_usage_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_busy_percent_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_memory_reserved_pages_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_rpms_get); - DLSYM_AND_CHECK(dl_rsmi_lib, 
rsmi_dev_fan_speed_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_fan_speed_max_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_temp_metric_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_volt_metric_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_overdrive_level_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_dev_ecc_count_get); - DLSYM_AND_CHECK(dl_rsmi_lib, rsmi_compute_process_info_get); - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Linking AMD ROCMm libraries done); - return 0; -} - -typedef struct { - RocmonContext* context; - int numGpus; - const int* gpuIds; -} iterate_agents_cb_arg; - -typedef struct { - RocmonDevice* device; - int currIndex; -} iterate_info_cb_arg; +//#include +const char* rocprofiler_group_arch = "amd_gpu"; -static hsa_status_t -_rocmon_iterate_info_callback_count(const rocprofiler_info_data_t info, void* data) -{ - RocmonDevice* device = (RocmonDevice*) data; - if (device) { - device->numRocMetrics++; - } - return HSA_STATUS_SUCCESS; -} - -static void -_rocmon_print_rocprofiler_info_data(const rocprofiler_info_data_t info) +void +rocmon_finalize(void) { - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) + if ((!rocmon_initialized) || (rocmon_context == NULL)) { + rocmon_context = NULL; + rocmon_initialized = 0; return; } - printf("Name '%s':\n", info.metric.name); - printf("\tKind: '%s'\n", (info.kind == ROCPROFILER_INFO_KIND_METRIC ? 
"Metric" : "Trace")); - printf("\tInstances: %d\n", info.metric.instances); - printf("\tDescription: '%s'\n", info.metric.description); - printf("\tExpression: '%s'\n", info.metric.expr); - printf("\tBlockName: '%s'\n", info.metric.block_name); - printf("\tBlockCounters: %d\n", info.metric.block_counters); -} - -static hsa_status_t -_rocmon_iterate_info_callback_add(const rocprofiler_info_data_t info, void* data) -{ - iterate_info_cb_arg* arg = (iterate_info_cb_arg*) data; - - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, _rocmon_iterate_info_callback_add); - if (likwid_rocmon_verbosity == DEBUGLEV_DEVELOP) - { - _rocmon_print_rocprofiler_info_data(info); - } - // Check info kind - if (info.kind != ROCPROFILER_INFO_KIND_METRIC) - { - ERROR_PRINT(Wrong info kind %u, info.kind); - return HSA_STATUS_ERROR; - } - - // Check index - if (arg->currIndex >= arg->device->numRocMetrics) - { - ERROR_PRINT(Metric index out of bounds: %d, arg->currIndex); - return HSA_STATUS_ERROR; - } - - // Copy info data - rocprofiler_info_data_t* target_info = &arg->device->rocMetrics[arg->currIndex]; - memcpy(target_info, &info, sizeof(rocprofiler_info_data_t)); - arg->currIndex++; - - return HSA_STATUS_SUCCESS; -} - - -static hsa_status_t -_rocmon_iterate_agents_callback(hsa_agent_t agent, void* argv) -{ - // Count number of callback invocations as the devices id - static int nextDeviceId = 0; - int deviceId = nextDeviceId; - bool noAgent = false; - - iterate_agents_cb_arg *arg = (iterate_agents_cb_arg*) argv; - - // Check if device is a GPU - hsa_device_type_t type; - ROCM_CALL(hsa_agent_get_info, (agent, HSA_AGENT_INFO_DEVICE, &type), return -1); - if (type != HSA_DEVICE_TYPE_GPU) - { - return HSA_STATUS_SUCCESS; - } - nextDeviceId++; - - // Check if device is includes in arg->gpuIds - int gpuIndex = -1; - for (int i = 0; i < arg->numGpus; i++) - { - if (deviceId == arg->gpuIds[i]) - { - gpuIndex = i; - break; - } - } - if (gpuIndex < 0) + if (rocmon_context->use_rocprofiler_v1) { - return 
HSA_STATUS_SUCCESS; - } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing agent %d, gpuIndex); - - // Add agent to context - RocmonDevice *device = &arg->context->devices[gpuIndex]; - device->deviceId = deviceId; - device->hsa_agent = agent; - device->context = NULL; - device->numActiveRocEvents = 0; - device->activeRocEvents = NULL; - device->numGroupResults = 0; - device->groupResults = NULL; - - // Get number of available metrics - device->numRocMetrics = 0; - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, RocProfiler provides %d events, device->numRocMetrics); - - // workaround for bug in ROCm 5.4.0 - if(device->numRocMetrics == 0) { - ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_count, device), return HSA_STATUS_ERROR); - noAgent = true; - } - - // Allocate memory for metrics - device->rocMetrics = (rocprofiler_info_data_t*) malloc(device->numRocMetrics * sizeof(rocprofiler_info_data_t)); - if (device->rocMetrics == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate set of rocMetrics); - return HSA_STATUS_ERROR; - } - - // Initialize SMI events map - if (init_map(&device->smiMetrics, MAP_KEY_TYPE_STR, 0, &free) < 0) - { - ERROR_PLAIN_PRINT(Cannot init smiMetrics map); - return HSA_STATUS_ERROR; - } - - // Fetch metric informatino - iterate_info_cb_arg info_arg = { - .device = device, - .currIndex = 0, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, Read %d RocProfiler events for device %d, device->numRocMetrics, device->deviceId); - - // If the call fails with agent, call rocprofiler_iterate_info without agent - if(noAgent) - { - ROCM_CALL(rocprofiler_iterate_info, (NULL, ROCPROFILER_INFO_KIND_METRIC, _rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } else { - ROCM_CALL(rocprofiler_iterate_info, (&agent, ROCPROFILER_INFO_KIND_METRIC, 
_rocmon_iterate_info_callback_add, &info_arg), return HSA_STATUS_ERROR); - } - - return HSA_STATUS_SUCCESS; -} - - -static int -_rocmon_parse_eventstring(const char* eventString, GroupInfo* group) -{ - int err = 0; - Configuration_t config = get_configuration(); - bstring eventBString = bfromcstr(eventString); - - if (bstrchrp(eventBString, ':', 0) != BSTR_ERR) - { - // If custom group -> perfgroup_customGroup - err = perfgroup_customGroup(eventString, group); - if (err < 0) - { - ERROR_PRINT(Cannot transform %s to performance group, eventString); - return err; - } + rocmon_v1_finalize(rocmon_context); } +#ifdef LIKWID_ROCPROF_SDK else { - // If performance group -> perfgroup_readGroup - err = perfgroup_readGroup(config->groupPath, "amd_gpu", eventString, group); - if (err == -EACCES) - { - ERROR_PRINT(Access to performance group %s not allowed, eventString); - return err; - } - else if (err == -ENODEV) - { - ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); - return err; - } - if (err < 0) - { - ERROR_PRINT(Cannot read performance group %s, eventString); - return err; - } - } - - return 0; -} - - -static int -_rocmon_get_timestamp(uint64_t* timestamp_ns) -{ - uint64_t timestamp; - - // Get timestamp from system - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP, ×tamp), return -1); - // Convert to nanoseconds - *timestamp_ns = (uint64_t)((long double)timestamp * rocmon_context->hsa_timestamp_factor); - - return 0; -} - - -static int -_rocmon_getLastResult(RocmonDevice* device, int eventId, double* value) -{ - rocprofiler_data_t* data = &device->activeRocEvents[eventId].data; - - switch (data->kind) - { - case ROCPROFILER_DATA_KIND_INT32: - *value = (double) data->result_int32; - break; - case ROCPROFILER_DATA_KIND_INT64: - *value = (double) data->result_int64; - break; - case ROCPROFILER_DATA_KIND_FLOAT: - *value = (double) data->result_float; - break; - case ROCPROFILER_DATA_KIND_DOUBLE: - *value = 
data->result_double; - break; - - case ROCPROFILER_DATA_KIND_BYTES: - case ROCPROFILER_DATA_KIND_UNINIT: - default: - return -1; - } - - return 0; -} - - -static int -_rocmon_readCounters_rocprofiler(RocmonDevice* device) -{ - int ret; - - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) - { - return 0; - } - - if (!device->context) - { - return 0; - } - - ROCM_CALL(rocprofiler_read, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_data, (device->context, 0), return -1); - ROCM_CALL(rocprofiler_get_metrics, (device->context), return -1); - - // Update results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveRocEvents; i++) - { - RocmonEventResult* result = &groupResult->results[i]; - - // Read value - ret = _rocmon_getLastResult(device, i, &result->fullValue); - if (ret < 0) - { - return -1; - } - - // Calculate delta since last read - result->lastValue = result->fullValue - result->lastValue; - } - - return 0; -} - - -static int -_rocmon_readCounters_smi(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) - { - return 0; + rocmon_sdk_finalize(rocmon_context); } +#endif - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) - { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - } - - return 0; -} - - -static int -_rocmon_readCounters(uint64_t* (*getDestTimestampFunc)(RocmonDevice* device)) -{ - int ret; - - // Get timestamp - uint64_t timestamp; - if (ret = _rocmon_get_timestamp(×tamp)) - { - return ret; - } + rocmon_smi_finalize(rocmon_context); - 
for (int i = 0; i < rocmon_context->numDevices; i++) + if (rocmon_context->devices) { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Save timestamp - if (getDestTimestampFunc) + for (int i = 0; i < rocmon_context->numDevices; i++) { - uint64_t* timestampDest = getDestTimestampFunc(device); - if (timestampDest) + RocmonDevice* dev = &rocmon_context->devices[i]; + if (dev->groupResults) { - *timestampDest = timestamp; - } - } - - // Read rocprofiler counters - ret = _rocmon_readCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Read SMI counters - ret = _rocmon_readCounters_smi(device); - if (ret < 0) return ret; - } - - return 0; -} - - -static uint64_t* -_rocmon_get_read_time(RocmonDevice* device) -{ - return &device->time.read; -} - - -static uint64_t* -_rocmon_get_stop_time(RocmonDevice* device) -{ - return &device->time.stop; -} - - -// ---------------------------------------------------- -// Rocmon SMI helper functions -// ---------------------------------------------------- - -static bstring -_rocmon_smi_build_label(RocmonSmiEventType type, const char* funcname, uint64_t variant, uint64_t subvariant) -{ - switch (type) - { - case ROCMON_SMI_EVENT_TYPE_NORMAL: - return bfromcstr(funcname); - case ROCMON_SMI_EVENT_TYPE_VARIANT: - return bformat("%s|%" PRIu64, funcname, variant); - case ROCMON_SMI_EVENT_TYPE_SUBVARIANT: - return bformat("%s|%" PRIu64 "|%" PRIu64, funcname, variant, subvariant); - case ROCMON_SMI_EVENT_TYPE_INSTANCES: - return bfromcstr(funcname); - } -} - - -static int -_rocmon_smi_add_event_to_device(RocmonDevice* device, const char* funcname, RocmonSmiEventType type, int64_t variant, uint64_t subvariant) -{ - int ret; - - // Get event by label - RocmonSmiEventList* list = NULL; - bstring label = _rocmon_smi_build_label(type, funcname, variant, subvariant); - ret = get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list); - bdestroy(label); - if (ret < 0) - { - // Event not registered -> ignore - return 
0; - } - - // For events with multiple sensor, only make one entry -> find if one exists - if (type == ROCMON_SMI_EVENT_TYPE_INSTANCES && subvariant > 0) - { - // Get list from map - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - RocmonSmiEvent* existingEvent = NULL; - ret = get_smap_by_key(device->smiMetrics, event->name, (void**)&existingEvent); - if (ret < 0) - { - ERROR_PRINT(Failed to find previous instance for event %s, event->name); - return -1; + for (int j = 0; j < dev->numGroupResults; j++) + { + RocmonEventResultList* l = &dev->groupResults[j]; + if (l->results) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy group result %d for device %d, j, dev->deviceId); + free(l->results); + l->results = NULL; + l->numResults = 0; + } + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy group results for device %d, dev->deviceId); + free(dev->groupResults); + dev->groupResults = NULL; } - - // Update instance information - existingEvent->instances++; } - return 0; - } - - for (int i = 0; i < list->numEntries; i++) - { - RocmonSmiEvent* event = &list->entries[i]; - - // Allocate memory for device event description - RocmonSmiEvent* tmpEvent = (RocmonSmiEvent*) malloc(sizeof(RocmonSmiEvent)); - if (tmpEvent == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event in device list %s, event->name); - return -ENOMEM; - } - - // Copy information from global description - memcpy(tmpEvent, event, sizeof(RocmonSmiEvent)); - tmpEvent->variant = variant; - tmpEvent->subvariant = subvariant; - tmpEvent->instances = 1; - - // Save event info to device event map - add_smap(device->smiMetrics, tmpEvent->name, tmpEvent); - } - - return 0; -} - - -static int -_rocmon_smi_get_function_subvariants(RocmonDevice* device, const char* funcname, uint64_t variant, rsmi_func_id_iter_handle_t var_iter) -{ - rsmi_func_id_iter_handle_t sub_var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open 
subvariants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(var_iter, &sub_var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No subvariants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_VARIANT, variant, 0); - if (ret < 0) return -1; - return 0; - } - - // Subvariants available -> iterate them - do { - // Get subvariant information - (*rsmi_func_iter_value_get_ptr)(sub_var_iter, &value); - - // Process info - if (variant == RSMI_DEFAULT_VARIANT) - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_INSTANCES, variant, value.id); - else - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, variant, value.id); - if (ret < 0) return ret; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(sub_var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - (*rsmi_dev_supported_func_iterator_close_ptr)(&sub_var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_function_variants(RocmonDevice* device, const char* funcname, rsmi_func_id_iter_handle_t iter_handle) -{ - rsmi_func_id_iter_handle_t var_iter; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Get open variants iterator - status = (*rsmi_dev_supported_variant_iterator_open_ptr)(iter_handle, &var_iter); - if (status == RSMI_STATUS_NO_DATA) - { - // No variants - ret = _rocmon_smi_add_event_to_device(device, funcname, ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - return 0; - } - - // Variants available -> iterate them - do { - // Get variant information - (*rsmi_func_iter_value_get_ptr)(var_iter, &value); - - // Get function subvariants - ret = _rocmon_smi_get_function_subvariants(device, funcname, value.id, var_iter); - if (ret < 0) return -1; - - // Advance iterator - status = (*rsmi_func_iter_next_ptr)(var_iter); - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - 
(*rsmi_dev_supported_func_iterator_close_ptr)(&var_iter); - - return 0; -} - - -static int -_rocmon_smi_get_functions(RocmonDevice* device) -{ - rsmi_func_id_iter_handle_t iter_handle; - rsmi_func_id_value_t value; - rsmi_status_t status; - int ret; - - // Open iterator - //(*rsmi_dev_supported_func_iterator_open_ptr)(device->deviceId, &iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_open, (device->deviceId, &iter_handle), { - return -1; - }); - - do - { - // Get function information - //(*rsmi_func_iter_value_get_ptr)(iter_handle, &value); - RSMI_CALL(rsmi_func_iter_value_get, (iter_handle, &value), { - ERROR_PRINT(Failed to get smi function value for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - }); - - // Get function variants - ret = _rocmon_smi_get_function_variants(device, value.name, iter_handle); - if (ret < 0) - { - ERROR_PRINT(Failed to get smi function variants for device %d, device->deviceId); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - return -1; - } - - // Advance iterator (cannot use RSMI_CALL macro here because we have an assignment, - // so we check that the function pointer exists to avoid segfaults.) 
- if (rsmi_func_iter_next_ptr) { - status = (*rsmi_func_iter_next_ptr)(iter_handle); - } - } while (status != RSMI_STATUS_NO_DATA); - - // Close iterator - //(*rsmi_dev_supported_func_iterator_close_ptr)(&iter_handle); - RSMI_CALL(rsmi_dev_supported_func_iterator_close, (&iter_handle), ); - - // Add device independent functions - ret = _rocmon_smi_add_event_to_device(device, "rsmi_compute_process_info_get", ROCMON_SMI_EVENT_TYPE_NORMAL, 0, 0); - if (ret < 0) return -1; - - return 0; -} - -#define ADD_SMI_EVENT(name, type, smifunc, variant, subvariant, extra, measurefunc) if (_rocmon_smi_add_event_to_map(name, type, smifunc, variant, subvariant, extra, measurefunc) < 0) { return -1; } -#define ADD_SMI_EVENT_N(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_NORMAL, smifunc, 0, 0, extra, measurefunc) -#define ADD_SMI_EVENT_V(name, smifunc, variant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_VARIANT, smifunc, variant, 0, extra, measurefunc) -#define ADD_SMI_EVENT_S(name, smifunc, variant, subvariant, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_SUBVARIANT, smifunc, variant, subvariant, extra, measurefunc) -#define ADD_SMI_EVENT_I(name, smifunc, extra, measurefunc) ADD_SMI_EVENT(name, ROCMON_SMI_EVENT_TYPE_INSTANCES, smifunc, 0, 0, extra, measurefunc) - -static int -_rocmon_smi_add_event_to_map(char* name, RocmonSmiEventType type, char* smifunc, uint64_t variant, uint64_t subvariant, uint64_t extra, RocmonSmiMeasureFunc measureFunc) -{ - // Add new event list to map (if not already present) - bstring label = _rocmon_smi_build_label(type, smifunc, variant, subvariant); - RocmonSmiEventList* list; - if (get_smap_by_key(rocmon_context->smiEvents, bdata(label), (void**)&list) < 0) - { - // Allocate memory for event list - list = (RocmonSmiEventList*) malloc(sizeof(RocmonSmiEventList)); - if (list == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event list %s, name); - return -ENOMEM; - } - 
list->entries = NULL; - list->numEntries = 0; - - add_smap(rocmon_context->smiEvents, bdata(label), list); - } - bdestroy(label); - - // Allocate memory for another event in list - list->numEntries++; - list->entries = (RocmonSmiEvent*) realloc(list->entries, list->numEntries * sizeof(RocmonSmiEvent)); - if (list->entries == NULL) - { - ERROR_PRINT(Failed to allocate memory for SMI event %s, name); - return -ENOMEM; - } - - // Set event properties - RocmonSmiEvent* event = &list->entries[list->numEntries-1]; - strncpy(event->name, name, sizeof(event->name)); - event->name[sizeof(event->name)] = '\0'; - event->type = type; - event->variant = variant; - event->subvariant = subvariant; - event->extra = extra; - event->instances = 0; // gets set when scanning supported device functions - event->measureFunc = measureFunc; - - return 0; -} - - -static void -_rcomon_smi_free_event_list(void* vlist) -{ - RocmonSmiEventList* list = (RocmonSmiEventList*)vlist; - if (list) - { - FREE_IF_NOT_NULL(list->entries); - free(list); - } -} - - -static int -_rocmon_smi_init_events() -{ - int ret; - - // Init map - ret = init_map(&rocmon_context->smiEvents, MAP_KEY_TYPE_STR, 0, &_rcomon_smi_free_event_list); - if (ret < 0) + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy devices); + free(rocmon_context->devices); + rocmon_context->devices = NULL; + rocmon_context->numDevices = 0; + } + if (rocmon_context->groups) { - ERROR_PRINT(Failed to create map for ROCm SMI events); - return -1; - } - - // Add events - ADD_SMI_EVENT_N("PCI_THROUGHPUT_SENT", "rsmi_dev_pci_throughput_get", 0, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_RECEIVED", "rsmi_dev_pci_throughput_get", 1, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_THROUGHPUT_MAX_PKT_SZ", "rsmi_dev_pci_throughput_get", 2, &_smi_wrapper_pci_throughput_get ); - ADD_SMI_EVENT_N("PCI_REPLAY_COUNTER", "rsmi_dev_pci_replay_counter_get", 0, &_smi_wrapper_pci_replay_counter_get ); - ADD_SMI_EVENT_I("POWER_AVE", 
"rsmi_dev_power_ave_get", 0, &_smi_wrapper_power_ave_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_VIS_VRAM", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_TOTAL_GTT", "rsmi_dev_memory_total_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_total_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_VIS_VRAM", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_VIS_VRAM, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_V("MEMORY_USAGE_GTT", "rsmi_dev_memory_usage_get", RSMI_MEM_TYPE_GTT, 0, &_smi_wrapper_memory_usage_get ); - ADD_SMI_EVENT_N("MEMORY_BUSY_PERCENT", "rsmi_dev_memory_busy_percent_get", 0, &_smi_wrapper_memory_busy_percent_get ); - ADD_SMI_EVENT_N("MEMORY_NUM_RESERVED_PAGES", "rsmi_dev_memory_reserved_pages_get", 0, &_smi_wrapper_memory_reserved_pages_get ); - ADD_SMI_EVENT_I("FAN_RPMS", "rsmi_dev_fan_rpms_get", 0, &_smi_wrapper_fan_rpms_get ); - ADD_SMI_EVENT_I("FAN_SPEED", "rsmi_dev_fan_speed_get", 0, &_smi_wrapper_fan_speed_get ); - ADD_SMI_EVENT_I("FAN_SPEED_MAX", "rsmi_dev_fan_speed_max_get", 0, &_smi_wrapper_fan_speed_max_get ); - ADD_SMI_EVENT_S("TEMP_EDGE", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_EDGE, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("TEMP_JUNCTION", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_JUNCTION, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("TEMP_MEMORY", "rsmi_dev_temp_metric_get", RSMI_TEMP_CURRENT, RSMI_TEMP_TYPE_MEMORY, 0, &_smi_wrapper_temp_metric_get ); - ADD_SMI_EVENT_S("VOLT_VDDGFX", "rsmi_dev_volt_metric_get", RSMI_VOLT_CURRENT, RSMI_VOLT_TYPE_VDDGFX, 0, &_smi_wrapper_volt_metric_get ); - ADD_SMI_EVENT_N("OVERDRIVE_LEVEL", "rsmi_dev_overdrive_level_get", 0, 
&_smi_wrapper_overdrive_level_get ); - ADD_SMI_EVENT_V("ECC_COUNT_UMC_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_UMC_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_UMC, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SDMA_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SDMA, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_GFX_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_GFX, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MMHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MMHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_ATHUB_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_ATHUB, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_PCIE_BIF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_PCIE_BIF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_HDP_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_HDP, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 0, 
&_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_XGMI_WAFL_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_XGMI_WAFL, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_DF_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_DF_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_DF, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SMN_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SMN, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_SEM_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_SEM, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP0_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP0, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_MP1_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_MP1, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_FUSE_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_FUSE, 1, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_CORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 0, &_smi_wrapper_ecc_count_get ); - ADD_SMI_EVENT_V("ECC_COUNT_LAST_UNCORRECTABLE", "rsmi_dev_ecc_count_get", RSMI_GPU_BLOCK_LAST, 1, &_smi_wrapper_ecc_count_get ); - 
ADD_SMI_EVENT_N("PROCS_USING_GPU", "rsmi_compute_process_info_get", 0, &_smi_wrapper_compute_process_info_get ); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy groups); + free(rocmon_context->groups); + rocmon_context->groups = NULL; + rocmon_context->numGroups = 0; + rocmon_context->numActiveGroups = 0; + rocmon_context->activeGroup = -1; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Destroy context); + free(rocmon_context); + rocmon_context = NULL; - return 0; + rocmon_initialized = FALSE; + return; } - int rocmon_init(int numGpus, const int* gpuIds) { - hsa_status_t status; + int err = 0; // check if already initialized if (rocmon_initialized) { return 0; } - if (rocmon_context != NULL) - { - return -EEXIST; - } - // Validate arguments if (numGpus <= 0) { ERROR_PRINT(Number of gpus must be greater than 0 but only %d given, numGpus); return -EINVAL; } - - // Initialize other parts - init_configuration(); - - // initialize libraries - int ret = _rocmon_link_libraries(); - if (ret < 0) + if (!gpuIds) { - ERROR_PLAIN_PRINT(Failed to initialize libraries); - return ret; + ERROR_PRINT(Invalid GPU list); + return -EINVAL; } + // Initialize other parts + init_configuration(); + // Allocate memory for context rocmon_context = (RocmonContext*) malloc(sizeof(RocmonContext)); if (rocmon_context == NULL) @@ -1178,167 +167,133 @@ rocmon_init(int numGpus, const int* gpuIds) ERROR_PLAIN_PRINT(Cannot allocate Rocmon context); return -ENOMEM; } + memset(rocmon_context, 0, sizeof(RocmonContext)); rocmon_context->groups = NULL; - rocmon_context->numGroups = 0; - rocmon_context->numActiveGroups = 0; - - rocmon_context->devices = (RocmonDevice*) malloc(numGpus * sizeof(RocmonDevice)); - rocmon_context->numDevices = numGpus; - if (rocmon_context->devices == NULL) + rocmon_context->devices = NULL; + +#ifdef LIKWID_ROCPROF_SDK + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler SDK); + err = rocmon_sdk_init(rocmon_context, numGpus, gpuIds); + 
ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler SDK returned %d, err); +#else + err = -1; +#endif + if (err != 0) { - ERROR_PLAIN_PRINT(Cannot allocate set of GPUs); - free(rocmon_context); - rocmon_context = NULL; - return -ENOMEM; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RocProfiler V1); + err = rocmon_v1_init(rocmon_context, numGpus, gpuIds); + if (err == 0) + { + rocmon_context->use_rocprofiler_v1 = 1; + } + else + { + ERROR_PRINT(Failed to initialize Rocprofiler v1 and SDK); + free(rocmon_context); + rocmon_context = NULL; + return err; + } } - - // init hsa library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing HSA); - ROCM_CALL(hsa_init, (), - { - ERROR_PLAIN_PRINT(Failed to init hsa library); - goto rocmon_init_hsa_failed; - }); - - // init rocm smi library - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing RSMI); - RSMI_CALL(rsmi_init, (0), - { - ERROR_PLAIN_PRINT(Failed to init rocm_smi); - goto rocmon_init_rsmi_failed; - }); - - // Get hsa timestamp factor - uint64_t frequency_hz; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Getting HSA timestamp factor); - ROCM_CALL(hsa_system_get_info, (HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency_hz), - { - ERROR_PLAIN_PRINT(Failed to get HSA timestamp factor); - goto rocmon_init_info_agents_failed; - }); - rocmon_context->hsa_timestamp_factor = (long double)1000000000 / (long double)frequency_hz; - - // initialize structures for specified devices (fetch ROCm specific info) - iterate_agents_cb_arg arg = { - .context = rocmon_context, - .numGpus = numGpus, - .gpuIds = gpuIds, - }; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Iterating through %d available agents, numGpus); - ROCM_CALL(hsa_iterate_agents, (_rocmon_iterate_agents_callback, &arg), + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Initializing ROCm SMI); + err = rocmon_smi_init(rocmon_context, numGpus, gpuIds); + if (err != 0) { - ERROR_PRINT(Error while iterating through available agents); - goto rocmon_init_info_agents_failed; - }); + // Only 
fail if there are no devices -> neither v1 nor sdk added them + if (rocmon_context->devices == NULL) + { + ERROR_PRINT(Failed to initialize Rocprofiler SMI); + free(rocmon_context); + rocmon_context = NULL; + return err; + } + } + rocmon_context->state = ROCMON_STATE_INITIALIZED; + rocmon_initialized = TRUE; + return err; +} - // Get available SMI events for devices - _rocmon_smi_init_events(); - for (int i = 0; i < rocmon_context->numDevices; i++) +int find_colon(const char* str) +{ + for (int i = 0; i < strlen(str); i++) { - if (_rocmon_smi_get_functions(&rocmon_context->devices[i]) < 0) + if (str[i] == ':') { - ERROR_PRINT(Failed to get SMI functions for device %d, rocmon_context->devices[i].deviceId); - goto rocmon_init_info_agents_failed; + return 1; } } - - rocmon_initialized = TRUE; return 0; -rocmon_init_info_agents_failed: - RSMI_CALL(rsmi_shut_down, (), { - // fall through - }); -rocmon_init_rsmi_failed: - ROCM_CALL(hsa_shut_down, (), { - // fall through - }); -rocmon_init_hsa_failed: - free(rocmon_context->devices); - free(rocmon_context); - rocmon_context = NULL; - return -1; } - -void -rocmon_finalize(void) +static int +_rocmon_parse_eventstring(const char* eventString, const char* arch, GroupInfo* group) { - RocmonContext* context = rocmon_context; + int err = 0; + const char colon = ':'; + Configuration_t config = get_configuration(); - if (!rocmon_initialized) + if ((strstr(eventString, &colon) != NULL) || (find_colon(eventString))) { - return; + // If custom group -> perfgroup_customGroup + err = perfgroup_customGroup(eventString, group); + if (err < 0) + { + ERROR_PRINT(Cannot transform %s to performance group, eventString); + return err; + } } - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Finalize LIKWID ROCMON); - - if (context) + else { - if (context->devices) + // If performance group -> perfgroup_readGroup + err = perfgroup_readGroup(config->groupPath, arch, eventString, group); + if (err == -EACCES) { - // Free each devices fields - for (int i = 0; 
i < context->numDevices; i++) - { - RocmonDevice* device = &context->devices[i]; - FREE_IF_NOT_NULL(device->rocMetrics); - FREE_IF_NOT_NULL(device->activeRocEvents); - FREE_IF_NOT_NULL(device->activeSmiEvents); - if (device->groupResults) - { - // Free events of event result lists - for (int j = 0; j < device->numGroupResults; j++) - { - FREE_IF_NOT_NULL(device->groupResults[i].results); - } - // Free list - free(device->groupResults); - } - if (device->context) - { - ROCM_CALL(rocprofiler_close, (device->context),); - } - destroy_smap(device->smiMetrics); - } - - free(context->devices); - context->devices = NULL; + ERROR_PRINT(Access to performance group %s not allowed, eventString); + return err; + } + else if (err == -ENODEV) + { + ERROR_PRINT(Performance group %s only available with deactivated HyperThreading, eventString); + return err; + } + if (err < 0) + { + ERROR_PRINT(Cannot read performance group %s for %s, eventString, arch); + return err; } - - FREE_IF_NOT_NULL(context->groups); - destroy_smap(context->smiEvents); - - free(context); - context = NULL; } - RSMI_CALL(rsmi_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown SMI); - // fall through - }); - ROCM_CALL(hsa_shut_down, (), { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Shutdown HSA); - // fall through - }); + return 0; } - int rocmon_addEventSet(const char* eventString, int* gid) { + int ret = 0; + GroupInfo group = {}; // Check arguments - if (!eventString) + if ((!gid) || (!eventString)) { return -EINVAL; } - + // Ensure rocmon is initialized if (!rocmon_initialized) { + ERROR_PRINT(ROCMON not initialized); return -EFAULT; } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding Eventstring %s, eventString); + ret = _rocmon_parse_eventstring(eventString, rocprofiler_group_arch, &group); + if (ret < 0) + { + return ret; + } + // Allocate memory for event group if necessary if (rocmon_context->numActiveGroups == rocmon_context->numGroups) { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Increasing group 
space to %d, rocmon_context->numGroups+1); GroupInfo* tmpInfo = (GroupInfo*) realloc(rocmon_context->groups, (rocmon_context->numGroups+1) * sizeof(GroupInfo)); if (tmpInfo == NULL) { @@ -1349,26 +304,21 @@ rocmon_addEventSet(const char* eventString, int* gid) rocmon_context->numGroups++; } - // Parse event string - int err = _rocmon_parse_eventstring(eventString, &rocmon_context->groups[rocmon_context->numActiveGroups]); - if (err < 0) - { - return err; - } - // Allocate memory for event results + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Allocate result space); for (int i = 0; i < rocmon_context->numDevices; i++) { RocmonDevice* device = &rocmon_context->devices[i]; // Allocate memory for event results - int numEvents = rocmon_context->groups[rocmon_context->numActiveGroups].nevents; + int numEvents = group.nevents; RocmonEventResult* tmpResults = (RocmonEventResult*) malloc(numEvents * sizeof(RocmonEventResult)); if (tmpResults == NULL) { ERROR_PLAIN_PRINT(Cannot allocate event results); return -ENOMEM; } + memset(tmpResults, 0, numEvents * sizeof(RocmonEventResult)); // Allocate memory for new event result list entry RocmonEventResultList* tmpGroupResults = (RocmonEventResultList*) realloc(device->groupResults, (device->numGroupResults+1) * sizeof(RocmonEventResultList)); @@ -1377,194 +327,20 @@ rocmon_addEventSet(const char* eventString, int* gid) ERROR_PLAIN_PRINT(Cannot allocate new event group result list); return -ENOMEM; } - device->groupResults = tmpGroupResults; device->groupResults[device->numGroupResults].results = tmpResults; device->groupResults[device->numGroupResults].numResults = numEvents; device->numGroupResults++; } + rocmon_context->groups[rocmon_context->numActiveGroups] = group; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Eventstring %s got GID %d, eventString, rocmon_context->numActiveGroups); *gid = rocmon_context->numActiveGroups; rocmon_context->numActiveGroups++; return 0; } -static int -_rocmon_setupCounters_rocprofiler(RocmonDevice* device, 
const char** events, int numEvents) -{ - // Close previous rocprofiler context - if (device->context) - { - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Closing previous rocprofiler context); - ROCM_CALL(rocprofiler_close, (device->context), return -1); - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create feature array to monitor - rocprofiler_feature_t* features = (rocprofiler_feature_t*) malloc(numEvents * sizeof(rocprofiler_feature_t)); - if (features == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate feature list); - return -ENOMEM; - } - for (int i = 0; i < numEvents; i++) - { - features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[i].name = events[i]; - ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP EVENT %d %s, i, events[i]); - } - - // Free previous feature array if present - FREE_IF_NOT_NULL(device->activeRocEvents); - - device->numActiveRocEvents = numEvents; - device->activeRocEvents = features; - - // Open context - rocprofiler_properties_t properties = {}; - properties.queue_depth = 128; - uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | ROCPROFILER_MODE_SINGLEGROUP; - - // Important: only a single profiling group is supported at this time which limits the number of events that can be monitored at a time. 
- ROCM_CALL(rocprofiler_open, (device->hsa_agent, device->activeRocEvents, device->numActiveRocEvents, &device->context, mode, &properties), return -1); - - return 0; -} - - -static int -_rocmon_setupCounters_smi(RocmonDevice* device, const char** events, int numEvents) -{ - int ret; - const int instanceNumLen = 5; - - // Delete previous events - if (device->activeSmiEvents) - { - device->activeSmiEvents = NULL; - device->numActiveSmiEvents = 0; - } - - // Look if the are any events - if (numEvents <= 0) - { - return 0; - } - - // Create event array - RocmonSmiEvent* activeEvents = (RocmonSmiEvent*) malloc(numEvents * sizeof(RocmonSmiEvent)); - if (activeEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate active event list); - return -ENOMEM; - } - - for (int i = 0; i < numEvents; i++) - { - char eventName[membersize(RocmonSmiEvent, name)]; - int instance = -1; - - // Parse event name -> normal event vs one with multiple instances (EVENT[0]) - const char* event = events[i]; - char* instancePart = strrchr(event, '['); - if (instancePart != NULL) - { - char withoutBrackets[instanceNumLen+1]; // +1 is '\0' - int partlen = strlen(instancePart); - - // Check if number fit in 'withoutBrackets' - if (partlen - 2 > instanceNumLen) - { - ERROR_PRINT(Instance number in '%s' is too large, event); - free(activeEvents); - return -EINVAL; - } - - // Copy instance number without brackets - strncpy(withoutBrackets, instancePart+1, partlen-2); - withoutBrackets[instanceNumLen] = '\0'; - - // Parse instance as number - char* endParsed; - instance = strtol(withoutBrackets, &endParsed, 10); - - // Check if parsing was successful - char* endOfString = &withoutBrackets[partlen-2]; - if (endParsed != endOfString) - { - ERROR_PRINT(Failed to parse instance number in '%s', event); - free(activeEvents); - return -EINVAL; - } - - // Copy event name without instance - int eventNameLen = instancePart - event; - strncpy(eventName, event, eventNameLen); - eventName[eventNameLen] = '\0'; - } 
- else - { - // Copy entire event name - strncpy(eventName, event, membersize(RocmonSmiEvent, name)); - } - - // Lookup event in available events - RocmonSmiEvent* metric = NULL; - ret = get_smap_by_key(device->smiMetrics, eventName, (void**)&metric); - if (ret < 0) - { - ERROR_PRINT(RSMI event '%s' not found for device %d, eventName, device->deviceId); - free(activeEvents); - return -EINVAL; - } - - // Copy event - RocmonSmiEvent* tmpEvent = &activeEvents[i]; - memcpy(tmpEvent, metric, sizeof(RocmonSmiEvent)); - - // Check if event supports instances - if (instance >= 0 && tmpEvent->type != ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(Instance number given but event '%s' does not support one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event requires instances - if (instance < 0 && tmpEvent->type == ROCMON_SMI_EVENT_TYPE_INSTANCES) - { - ERROR_PRINT(No instance number given but event '%s' requires one, eventName); - free(activeEvents); - return -EINVAL; - } - - // Check if event has enough instances - if (instance >= 0 && instance >= metric->instances) - { - ERROR_PRINT(Instance %d seleced but event '%s' has only %d, instance, eventName, metric->instances); - free(activeEvents); - return -EINVAL; - } - - // Set instance number - if (instance >= 0) - { - tmpEvent->subvariant = instance; - } - } - - device->activeSmiEvents = activeEvents; - device->numActiveSmiEvents = numEvents; - - return 0; -} - int rocmon_setupCounters(int gid) @@ -1574,12 +350,19 @@ rocmon_setupCounters(int gid) // Check arguments if (gid < 0 || gid >= rocmon_context->numActiveGroups) { + ERROR_PRINT(Invalid eventset ID %d, gid); return -EINVAL; } // Ensure rocmon is initialized if (!rocmon_initialized) { + ERROR_PRINT(Rocmon not initialized); + return -EFAULT; + } + if ((rocmon_context->state != ROCMON_STATE_STOPPED) && (rocmon_context->state != ROCMON_STATE_INITIALIZED)) + { + ERROR_PRINT(Rocmon not in a valid state to setup -> %d, rocmon_context->state); 
return -EFAULT; } @@ -1589,23 +372,8 @@ rocmon_setupCounters(int gid) // // Separate rocprofiler and SMI events // - const char **smiEvents = NULL, **rocEvents = NULL; int numSmiEvents = 0, numRocEvents = 0; - // Allocate memory for string arrays - smiEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (smiEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate smiEvent name array); - return -ENOMEM; - } - rocEvents = (const char**) malloc(group->nevents * sizeof(const char*)); - if (rocEvents == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate rocEvent name array); - free(smiEvents); - return -ENOMEM; - } // Go through each event and sort it for (int i = 0; i < group->nevents; i++) @@ -1614,13 +382,11 @@ rocmon_setupCounters(int gid) if (strncmp(name, "RSMI_", 5) == 0) { // RSMI event - smiEvents[numSmiEvents] = name + 5; // +5 removes 'RSMI_' prefix numSmiEvents++; } else if (strncmp(name, "ROCP_", 5) == 0) { // Rocprofiler event - rocEvents[numRocEvents] = name + 5; // +5 removes 'ROCP_' prefix numRocEvents++; } else @@ -1631,199 +397,302 @@ rocmon_setupCounters(int gid) } } - // Add events to each device for (int i = 0; i < rocmon_context->numDevices; i++) { RocmonDevice* device = &rocmon_context->devices[i]; + device->numActiveSmiEvents = 0; + device->numActiveRocEvents = 0; + } + + // Add rocprofiler events + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCPROFILER WITH %d events, numRocEvents); + if (rocmon_context->use_rocprofiler_v1) + { + ret = rocmon_v1_setupCounters(rocmon_context, gid); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ret = rocmon_sdk_setupCounters(rocmon_context, gid); + } +#endif + if (ret < 0) + { + ERROR_PRINT(Setting up rocprofiler counters failed); +/* free(smiEvents);*/ +/* free(rocEvents);*/ + return ret; + } - // Add rocprofiler events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCPROFILER WITH %d events, numRocEvents); - ret = _rocmon_setupCounters_rocprofiler(device, rocEvents, numRocEvents); + // Add SMI events 
+ if (numSmiEvents > 0) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, SETUP ROCM SMI WITH %d events, numSmiEvents); + ret = rocmon_smi_setupCounters(rocmon_context, gid); if (ret < 0) { - free(smiEvents); - free(rocEvents); + ERROR_PRINT(Setting up SMI counters failed); +/* free(smiEvents);*/ +/* free(rocEvents);*/ return ret; } - - // Add SMI events - ROCMON_DEBUG_PRINT(DEBUGLEV_INFO, SETUP ROCM SMI WITH %d events, numSmiEvents); - ret = _rocmon_setupCounters_smi(device, smiEvents, numSmiEvents); - if (ret < 0) + } + else + { + for (int i = 0; i < rocmon_context->numDevices; i++) { - free(smiEvents); - free(rocEvents); - return ret; + RocmonDevice* device = &rocmon_context->devices[i]; + device->numActiveSmiEvents = 0; } } + + // Add events to each device + for (int i = 0; i < rocmon_context->numDevices; i++) + { + RocmonDevice* device = &rocmon_context->devices[i]; + device->activeGroup = gid; + } rocmon_context->activeGroup = gid; + rocmon_context->state = ROCMON_STATE_SETUP; +/* // Cleanup*/ +/* free(smiEvents);*/ +/* free(rocEvents);*/ + + return 0; +} - // Cleanup - free(smiEvents); - free(rocEvents); + +int +rocmon_startCounters(void) +{ + int ret = 0; + + // Ensure rocmon is initialized + if (!rocmon_initialized) + { + ERROR_PRINT(ROCMON not initialized); + return -EFAULT; + } + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_SETUP)) + { + ERROR_PRINT(No eventset configured for ROCMON); + return -EFAULT; + } + if (rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON rocprofiler_v1 counters); + ret = rocmon_v1_startCounters(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_startCounters(rocmon_context); + } +#endif + if (ret < 0) + { + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Starting ROCMON SMI counters); + ret = rocmon_smi_startCounters(rocmon_context); + if (ret < 
0) + { + return ret; + } + rocmon_context->state = ROCMON_STATE_RUNNING; return 0; } -static int -_rocmon_startCounters_rocprofiler(RocmonDevice* device) +int +rocmon_stopCounters(void) { - // Check if there are any counters to start - if (device->numActiveRocEvents <= 0) + int ret = 0; + + // Ensure rocmon is initialized + if (!rocmon_initialized) { - return 0; + return -EFAULT; } - - // Reset results - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveRocEvents; i++) + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_RUNNING)) { - RocmonEventResult* result = &groupResult->results[i]; - result->lastValue = 0; - result->fullValue = 0; + return -EFAULT; } - - if (device->context) + if (rocmon_context->use_rocprofiler_v1) { - ROCM_CALL(rocprofiler_start, (device->context, 0), return -1); + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON rocprofiler_v1 counters); + ret = rocmon_v1_stopCounters(rocmon_context); } - - return 0; -} - - -static int -_rocmon_startCounters_smi(RocmonDevice* device) -{ - // Check if there are any counters to start - if (device->numActiveSmiEvents <= 0) +#ifdef LIKWID_ROCPROF_SDK + else { - return 0; + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_stopCounters(rocmon_context); } - - // Save baseline values - RocmonEventResultList* groupResult = &device->groupResults[rocmon_context->activeGroup]; - for (int i = 0; i < device->numActiveSmiEvents; i++) +#endif + if (ret < 0) { - double value = 0; - RocmonSmiEvent* event = &device->activeSmiEvents[i]; - RocmonEventResult* result = &groupResult->results[device->numActiveRocEvents+i]; - - // Measure counter - if (event->measureFunc) - { - event->measureFunc(device->deviceId, event, result); - } - - // Save value - result->fullValue = 0; + return ret; } - + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Stopping ROCMON SMI counters); + ret = 
rocmon_smi_stopCounters(rocmon_context); + if (ret < 0) + { + return ret; + } + rocmon_context->state = ROCMON_STATE_STOPPED; return 0; } - int -rocmon_startCounters(void) +rocmon_readCounters(void) { - int ret; + int ret = 0; // Ensure rocmon is initialized if (!rocmon_initialized) { return -EFAULT; } - - // Get timestamp - uint64_t timestamp; - if (ret = _rocmon_get_timestamp(×tamp)) + if ((rocmon_context->activeGroup < 0) || (rocmon_context->state != ROCMON_STATE_RUNNING)) + { + return -EFAULT; + } + if (rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON rocprofiler_v1 counters); + ret = rocmon_v1_readCounters(rocmon_context); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON rocprofiler_sdk counters); + ret = rocmon_sdk_readCounters(rocmon_context); + } +#endif + if (ret < 0) { + ERROR_PRINT(Failed to read ROCMON rocprofiler counters); return ret; } - - // Start counters on each device - for (int i = 0; i < rocmon_context->numDevices; i++) + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Reading ROCMON SMI counters); + ret = rocmon_smi_readCounters(rocmon_context); + if (ret < 0) { - RocmonDevice* device = &rocmon_context->devices[i]; - device->time.start = timestamp; - device->time.read = timestamp; - - // Start rocprofiler events - ret = _rocmon_startCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Start SMI events - _rocmon_startCounters_smi(device); - if (ret < 0) return ret; + ERROR_PRINT(Failed to read ROCMON SMI counters); + return ret; } - return 0; } -static int -_rocmon_stopCounters_rocprofiler(RocmonDevice* device) +int +rocmon_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) { - if (device->context) + int ret = 0; + EventList_rocm_t l = malloc(sizeof(EventList_rocm)); + if (!l) { - // Close context - ROCM_CALL(rocprofiler_stop, (device->context, 0), return -1); + return -ENOMEM; } - + memset(l, 0, sizeof(EventList_rocm)); + if 
(rocmon_context->use_rocprofiler_v1) + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding RocProfiler V1 events); + ret = rocmon_v1_getEventsOfGpu(rocmon_context, gpuIdx, &l); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding RocProfiler SDK events); + ret = rocmon_sdk_getEventsOfGpu(rocmon_context, gpuIdx, &l); + } +#endif + if (ret < 0) + { + rocmon_freeEventsOfGpu(l); + return ret; + } + ROCMON_DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding ROCm SMI events); + ret = rocmon_smi_getEventsOfGpu(rocmon_context, gpuIdx, &l); + if (ret < 0) + { + rocmon_freeEventsOfGpu(l); + return ret; + } + *list = l; return 0; } - -int -rocmon_stopCounters(void) +void +rocmon_freeEventsOfGpu(EventList_rocm_t list) { - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) + if (!list) { - return -EFAULT; + return; } - - // Read counters - ret = _rocmon_readCounters(&_rocmon_get_stop_time); - if (ret < 0) return ret; - - for (int i = 0; i < rocmon_context->numDevices; i++) + if (list->events != NULL) { - RocmonDevice* device = &rocmon_context->devices[i]; - - // Stop rocprofiler events - ret = _rocmon_stopCounters_rocprofiler(device); - if (ret < 0) return ret; - - // Nothing to stop for SMI events + for (int i = 0; i < list->numEvents; i++) + { + Event_rocm_t* event = &list->events[i]; + if (event->name) { + free(event->name); + event->name = NULL; + } + if (event->description) { + free(event->description); + event->description = NULL; + } + } + free(list->events); + list->events = NULL; } - - return 0; + free(list); + return; } int -rocmon_readCounters(void) +rocmon_switchActiveGroup(int newGroupId) { - int ret; - - // Ensure rocmon is initialized - if (!rocmon_initialized) + int ret = 0; + if (rocmon_context->use_rocprofiler_v1) { - return -EFAULT; + ret = rocmon_v1_switchActiveGroup(rocmon_context, newGroupId); + } +#ifdef LIKWID_ROCPROF_SDK + else + { + ret = rocmon_sdk_switchActiveGroup(rocmon_context, newGroupId); + } +#endif + if 
(ret < 0) + { + return ret; + } + ret = rocmon_smi_switchActiveGroup(rocmon_context, newGroupId); + if (ret < 0) + { + return ret; } + return 0; +} - // Read counters - ret = _rocmon_readCounters(&_rocmon_get_read_time); - if (ret < 0) return ret; - return 0; + +void rocmon_setVerbosity(int level) +{ + if (level >= DEBUGLEV_ONLY_ERROR && level <= DEBUGLEV_DEVELOP) + { + likwid_rocmon_verbosity = level; + } } + double rocmon_getResult(int gpuIdx, int groupId, int eventId) { @@ -1893,173 +762,6 @@ rocmon_getLastResult(int gpuIdx, int groupId, int eventId) } -int -rocmon_getEventsOfGpu(int gpuIdx, EventList_rocm_t* list) -{ - // Ensure rocmon is initialized - if (!rocmon_initialized) - { - return -EFAULT; - } - - // Validate args - if (gpuIdx < 0 || gpuIdx > rocmon_context->numDevices) - { - return -EINVAL; - } - if (list == NULL) - { - return -EINVAL; - } - - RocmonDevice* device = &rocmon_context->devices[gpuIdx]; - - // Allocate list structure - EventList_rocm_t tmpList = (EventList_rocm_t) malloc(sizeof(EventList_rocm)); - if (tmpList == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate event list); - return -ENOMEM; - } - - // Get number of events - printf("NUmber of events %d + %d\n", device->numRocMetrics , get_map_size(device->smiMetrics)); - tmpList->numEvents = device->numRocMetrics + get_map_size(device->smiMetrics); - if (tmpList->numEvents == 0) - { - // No events -> return empty list - tmpList->events = NULL; - *list = tmpList; - return 0; - } - - // Allocate event array - tmpList->events = (Event_rocm_t*) malloc(tmpList->numEvents * sizeof(Event_rocm_t)); - if (tmpList->events == NULL) - { - ERROR_PLAIN_PRINT(Cannot allocate events for event list); - free(tmpList); - return -ENOMEM; - } - - // Copy rocprofiler event information - for (int i = 0; i < device->numRocMetrics; i++) - { - rocprofiler_info_data_t* event = &device->rocMetrics[i]; - Event_rocm_t* out = &tmpList->events[i]; - int len; - - // Copy name - printf("Name %s\n", event->metric.name); - len 
= strlen(event->metric.name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "ROCP_%s", event->metric.name); - } - - // Copy description - len = strlen(event->metric.description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", event->metric.description); - } - - // Copy instances - out->instances = event->metric.instances; - } - - // Copy ROCm SMI metric information - for (int i = 0; i < get_map_size(device->smiMetrics); i++) - { - RocmonSmiEvent* event = NULL; - Event_rocm_t* out = &tmpList->events[device->numRocMetrics + i]; - int len; - - // Get event - if (get_smap_by_idx(device->smiMetrics, i, (void**)&event) < 0) - { - continue; - } - - // Copy name - len = strlen(event->name) + 5 /* Prefix */ + 1 /* NULL byte */; - out->name = (char*) malloc(len); - if (out->name) - { - snprintf(out->name, len, "RSMI_%s", event->name); - } - - // Copy description - char* description = "SMI Event"; // TODO: use real descriptions - len = strlen(description) + 1 /* NULL byte */; - out->description = (char*) malloc(len); - if (out->description) - { - snprintf(out->description, len, "%s", description); - } - - // Copy instances - out->instances = event->instances; - } - - *list = tmpList; - return 0; -} - -void -rocmon_freeEventsOfGpu(EventList_rocm_t list) -{ -#define FREE_IF_NOT_NULL(var) if ( var ) { free( var ); var = NULL; } - - // Check pointer - if (list == NULL) - { - return; - } - - if (list->events != NULL) - { - for (int i = 0; i < list->numEvents; i++) - { - Event_rocm_t* event = &list->events[i]; - FREE_IF_NOT_NULL(event->name); - FREE_IF_NOT_NULL(event->description); - } - free(list->events); - } - free(list); -} - - -int -rocmon_switchActiveGroup(int newGroupId) -{ - int ret; - - ret = rocmon_stopCounters(); - if (ret < 0) - { - return ret; - } - - ret = rocmon_setupCounters(newGroupId); - if (ret < 0) - { - 
return ret; - } - - ret = rocmon_startCounters(); - if (ret < 0) - { - return ret; - } - - return 0; -} - - int rocmon_getNumberOfGroups(void) { @@ -2120,54 +822,31 @@ rocmon_getNumberOfMetrics(int groupId) double rocmon_getTimeOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) + // Ensure rocmon is initialized + if (!rocmon_initialized) { return -EFAULT; } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.start)); - } - return t*1E-9; + return 0; } double rocmon_getLastTimeOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) + // Ensure rocmon is initialized + if (!rocmon_initialized) { return -EFAULT; } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.stop - device->time.read)); - } - return t*1E-9; + return 0; } double rocmon_getTimeToLastReadOfGroup(int groupId) { - int i = 0; - double t = 0; - if (!rocmon_context || !rocmon_initialized || (groupId < 0) || groupId > rocmon_context->numActiveGroups) - { - return -EFAULT; - } - for (i = 0; i < rocmon_context->numDevices; i++) - { - RocmonDevice* device = &rocmon_context->devices[i]; - t = MAX(t, (double)(device->time.read - device->time.start)); - } - return t*1E-9; + return 0; } @@ -2254,14 +933,14 @@ rocmon_getGroupInfoLong(int groupId) return ginfo->longinfo; } - int rocmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos) { init_configuration(); Configuration_t config = get_configuration(); - return perfgroup_getGroups(config->groupPath, "amd_gpu", groups, shortinfos, longinfos); + + return perfgroup_getGroups(config->groupPath, rocprofiler_group_arch, groups, shortinfos, 
longinfos); } @@ -2271,12 +950,16 @@ rocmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longi perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); } -void rocmon_setVerbosity(int level) + + +// only used internally by the ROCMON MarkerAPI +GroupInfo* rocmon_get_group(int gid) { - if (level >= DEBUGLEV_ONLY_ERROR && level <= DEBUGLEV_DEVELOP) + if ((gid >= 0) && (gid < rocmon_context->numActiveGroups)) { - likwid_rocmon_verbosity = level; + return &rocmon_context->groups[gid]; } + return NULL; } diff --git a/src/rocmon_marker.c b/src/rocmon_marker.c index 68337239d..976e3ce10 100644 --- a/src/rocmon_marker.c +++ b/src/rocmon_marker.c @@ -39,9 +39,21 @@ #include #include -#include +#include +#include +#include +#ifdef LIKWID_ROCPROF_SDK +#include +#endif +#include + +#ifndef FREE_IF_NOT_NULL +#define FREE_IF_NOT_NULL(x) if (x != NULL) { free(x); x = NULL; } +#endif +#ifndef gettid #define gettid() syscall(SYS_gettid) +#endif #ifndef NAN #define NAN (0.0/0.0) @@ -200,8 +212,6 @@ _rocmon_saveToFile(const char* markerfile) static void _rocmon_finalize(void) { -#define FREE_IF_NOT_NULL(x) if (x != NULL) { free(x); x = NULL; } - // Ensure markers were initialized if (!rocmon_marker_initialized) { @@ -231,6 +241,7 @@ rocmon_markerInit(void) { return; } + printf("rocmon_markerInit\n"); // Get environment variables char* eventStr = getenv("LIKWID_ROCMON_EVENTS"); @@ -242,7 +253,7 @@ rocmon_markerInit(void) // Validate environment variables are set if ((eventStr == NULL) || (gpuStr == NULL) || (gpuFileStr == NULL)) { - fprintf(stderr, "Running without GPU Marker API. Activate GPU Marker API with -m, -G and -W on commandline.\n"); + fprintf(stderr, "Running without Rocmon Marker API. 
Activate Rocmon Marker API with -m, -I and -R on commandline.\n"); return; } if (verbosityStr != NULL) { @@ -299,7 +310,7 @@ rocmon_markerInit(void) ret = rocmon_init(num_gpus, gpu_ids); if (ret < 0) { - fprintf(stderr,"Error init Rocmon Marker API.\n"); + fprintf(stderr,"Error initializing Rocmon Marker API with %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -314,7 +325,7 @@ rocmon_markerInit(void) ret = rocmon_addEventSet(bdata(gEventStrings->entry[i]), &gpu_groups[i]); if (ret < 0) { - fprintf(stderr,"Error setting up Rocmon Marker API.\n"); + fprintf(stderr,"Error setting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -335,7 +346,7 @@ rocmon_markerInit(void) ret = rocmon_setupCounters(gpu_groups[active_group]); if (ret) { - fprintf(stderr,"Error setting up Rocmon Marker API.\n"); + fprintf(stderr,"Error setting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -347,7 +358,7 @@ rocmon_markerInit(void) ret = rocmon_startCounters(); if (ret) { - fprintf(stderr,"Error starting up Rocmon Marker API.\n"); + fprintf(stderr,"Error starting up Rocmon Marker API: %d\n", ret); free(gpu_ids); free(gpu_maps); free(gpu_groups); @@ -386,6 +397,7 @@ rocmon_markerClose(void) } else { + printf("Saving ROCMON MarkerAPI results to %s\n", markerfile); _rocmon_saveToFile(markerfile); } @@ -718,6 +730,7 @@ rocmon_readMarkerFile(const char* filename) fprintf(stderr, "Error opening file %s\n", filename); } ptr = fgets(buf, sizeof(buf), fp); + printf("# %s\n", buf); ret = sscanf(buf, "%d %d %d", &gpus, ®ions, &groups); if (ret != 3) { @@ -768,6 +781,7 @@ rocmon_readMarkerFile(const char* filename) } while (fgets(buf, sizeof(buf), fp)) { + printf("# %s\n", buf); if (strchr(buf,':')) { int regionid = 0, groupid = -1; @@ -1064,8 +1078,8 @@ rocmon_getMetricOfRegionGpu(int region, int metricId, int gpuId) { return NAN; } - GroupInfo* ginfo = 
&rocmon_context->groups[rocmMarkerResults[region].groupID]; - if (metricId < 0 || metricId >= ginfo->nmetrics) + GroupInfo* ginfo = rocmon_get_group(rocmMarkerResults[region].groupID); + if ((!ginfo) || (metricId < 0) || (metricId >= ginfo->nmetrics)) { return NAN; } diff --git a/test/test_rocmon.c b/test/test_rocmon.c new file mode 100644 index 000000000..89df579b0 --- /dev/null +++ b/test/test_rocmon.c @@ -0,0 +1,72 @@ +#include +#include + + +#include + + + + + +int main(int argc, char* argv[]) +{ + int gpuId = 0; + int ret = 0; + int gid = -1; + rocmon_setVerbosity(DEBUGLEV_DEVELOP); + ret = rocmon_init(1, &gpuId); + if (ret < 0) + { + printf("rocmon_init failed with %d\n", ret); + return ret; + } + ret = rocmon_addEventSet("ROCP_SQ_WAVES:ROCM0", &gid); + if (ret < 0) + { + printf("rocmon_addEventSet failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Event set ID %d\n", gid); + ret = rocmon_setupCounters(gid); + if (ret < 0) + { + printf("rocmon_setupCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + ret = rocmon_startCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_readCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_readCounters(); + if (ret < 0) + { + printf("rocmon_startCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters running\n"); + ret = rocmon_stopCounters(); + if (ret < 0) + { + printf("rocmon_stopCounters failed with %d\n", ret); + rocmon_finalize(); + return ret; + } + printf("test_rocmon -- Counters stopped\n"); + rocmon_finalize(); + return 0; +}