diff --git a/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o b/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o
index aa8e612463..1991cfd50f 100644
Binary files a/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o and b/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o differ
diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go
index a2b104c327..7bbb9ddf5a 100644
--- a/pkg/collector/stats/stats.go
+++ b/pkg/collector/stats/stats.go
@@ -75,7 +75,7 @@ func NewStats() *Stats {
 		m.ResourceUsage[metricName] = types.NewUInt64StatCollection()
 	}
 
-	if gpu.IsGPUCollectionSupported() {
+	if config.EnabledGPU {
 		m.ResourceUsage[config.GPUSMUtilization] = types.NewUInt64StatCollection()
 		m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection()
 	}
diff --git a/pkg/config/config.go b/pkg/config/config.go
index a58895b4f3..53463e8f79 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -455,3 +455,7 @@ func IsCgroupMetricsEnabled() bool {
 func IsIRQCounterMetricsEnabled() bool {
 	return ExposeIRQCounterMetrics
 }
+
+func SetGpuUsageMetric(metric string) {
+	GpuUsageMetric = metric
+}
diff --git a/pkg/metrics/consts/conts.go b/pkg/metrics/consts/conts.go
index 3a37dcfe12..786a767118 100644
--- a/pkg/metrics/consts/conts.go
+++ b/pkg/metrics/consts/conts.go
@@ -90,4 +90,8 @@ var (
 		config.CgroupfsSystemCPU,
 		config.CgroupfsUserCPU,
 	}
+	GPUMetricNames = []string{
+		config.GPUSMUtilization,
+		config.GPUMemUtilization,
+	}
 )
diff --git a/pkg/metrics/container/metrics.go b/pkg/metrics/container/metrics.go
index 99e7655be5..a6b6f41ddd 100644
--- a/pkg/metrics/container/metrics.go
+++ b/pkg/metrics/container/metrics.go
@@ -79,6 +79,10 @@ func (c *collector) initMetrics() {
 		c.descriptions[name] = desc
 		c.collectors[name] = metricfactory.NewPromCounter(desc)
 	}
+	for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
+		c.descriptions[name] = desc
+		c.collectors[name] = metricfactory.NewPromCounter(desc)
+	}
 
 	desc := metricfactory.MetricsPromDesc(context, "joules", "_total", "", consts.ContainerEnergyLabels)
 	c.descriptions["total"] = desc
diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go
index 77fac69322..705f24729b 100644
--- a/pkg/metrics/metricfactory/metric_factory.go
+++ b/pkg/metrics/metricfactory/metric_factory.go
@@ -114,6 +114,16 @@ func NodeCPUFrequencyMetricsPromDesc(context string) (descriptions map[string]*p
 	return descriptions
 }
 
+func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) {
+	descriptions = make(map[string]*prometheus.Desc)
+	if config.EnabledGPU {
+		for _, name := range consts.GPUMetricNames {
+			descriptions[name] = resMetricsPromDesc(context, name, "nvidia-nvml")
+		}
+	}
+	return descriptions
+}
+
 func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) {
 	var labels []string
 	switch context {
diff --git a/pkg/metrics/process/metrics.go b/pkg/metrics/process/metrics.go
index 526cc79fc6..0cf029f9a0 100644
--- a/pkg/metrics/process/metrics.go
+++ b/pkg/metrics/process/metrics.go
@@ -78,6 +78,10 @@ func (c *collector) initMetrics() {
 		c.descriptions[name] = desc
 		c.collectors[name] = metricfactory.NewPromCounter(desc)
 	}
+	for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
+		c.descriptions[name] = desc
+		c.collectors[name] = metricfactory.NewPromCounter(desc)
+	}
 }
 
 func (c *collector) Describe(ch chan<- *prometheus.Desc) {
diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go
index 5022bfc2ea..82bb22f837 100644
--- a/pkg/metrics/utils/utils.go
+++ b/pkg/metrics/utils/utils.go
@@ -66,6 +66,12 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac
 			CollectResUtil(ch, instance, collectorName, collectors[collectorName])
 		}
 	}
+
+	if config.EnabledGPU {
+		for _, collectorName := range consts.GPUMetricNames {
+			CollectResUtil(ch, instance, collectorName, collectors[collectorName])
+		}
+	}
 }
 
 func collect(ch chan<- prometheus.Metric, collector metricfactory.PromMetric, value float64, labelValues []string) {
diff --git a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go
index b50dd0def9..cd87962507 100644
--- a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go
+++ b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go
@@ -31,6 +31,8 @@ import (
 var (
 	// List of GPU identifiers for the device
 	devices []interface{}
+	// bool to check if the process utilization collection is supported
+	processUtilizationSupported bool = true
 )
 
 type GPUNvml struct {
@@ -116,29 +118,56 @@ func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, sin
 	processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{}
 	lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000)
 
-	processUtilizationSample, ret := device.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp)
-	if ret != nvml.SUCCESS {
-		if ret == nvml.ERROR_NOT_FOUND {
-			// ignore the error if there is no process running in the GPU
-			return nil, nil
+	if processUtilizationSupported {
+		processUtilizationSample, ret := device.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp)
+		if ret != nvml.SUCCESS {
+			if ret == nvml.ERROR_NOT_FOUND {
+				// ignore the error if there is no process running in the GPU
+				return nil, nil
+			}
+			processUtilizationSupported = false
+		} else {
+			for _, pinfo := range processUtilizationSample {
+				// pid 0 means no data.
+				if pinfo.Pid != 0 {
+					processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
+						Pid:       pinfo.Pid,
+						TimeStamp: pinfo.TimeStamp,
+						SmUtil:    pinfo.SmUtil,
+						MemUtil:   pinfo.MemUtil,
+						EncUtil:   pinfo.EncUtil,
+						DecUtil:   pinfo.DecUtil,
+					}
+				}
+			}
 		}
-		return nil, fmt.Errorf("failed to get processes' utilization on device %v: %v", device, nvml.ErrorString(ret))
 	}
-
-	for _, pinfo := range processUtilizationSample {
-		// pid 0 means no data.
-		if pinfo.Pid != 0 {
-			processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
-				Pid:       pinfo.Pid,
-				TimeStamp: pinfo.TimeStamp,
-				SmUtil:    pinfo.SmUtil,
-				MemUtil:   pinfo.MemUtil,
-				EncUtil:   pinfo.EncUtil,
-				DecUtil:   pinfo.DecUtil,
+	if !processUtilizationSupported { // process utilization is not supported; fall back to deviceGetMPSComputeRunningProcesses_v3 and use the memory usage ratio to attribute power usage
+		config.GpuUsageMetric = config.GPUMemUtilization
+		processInfo, ret := device.(nvml.Device).GetComputeRunningProcesses()
+		if ret != nvml.SUCCESS {
+			if ret == nvml.ERROR_NOT_FOUND {
+				// ignore the error if there is no process running in the GPU
+				return nil, nil
+			}
+			return nil, fmt.Errorf("failed to get processes' utilization on device %v: %v", device, nvml.ErrorString(ret))
+		}
+		memoryInfo, ret := device.(nvml.Device).GetMemoryInfo()
+		if ret != nvml.SUCCESS {
+			return nil, fmt.Errorf("failed to get memory info on device %v: %v", device, nvml.ErrorString(ret))
+		}
+		// convert processInfo to processUtilizationSample
+		for _, pinfo := range processInfo {
+			// pid 0 means no data.
+			if pinfo.Pid != 0 {
+				processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
+					Pid:     pinfo.Pid,
+					MemUtil: uint32(pinfo.UsedGpuMemory * 100 / memoryInfo.Total),
+				}
+				klog.V(5).Infof("pid: %d, memUtil: %d\n", pinfo.Pid, processAcceleratorMetrics[pinfo.Pid].MemUtil)
 			}
 		}
 	}
-
 	return processAcceleratorMetrics, nil
 }
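Reviewer note (not part of the patch): the gpu_nvml.go hunk is the core of this change. GetProcessUtilization() is tried first; once it returns a non-NOT_FOUND error, the collector switches to GetComputeRunningProcesses() plus GetMemoryInfo() and reports each process's share of GPU memory as its utilization, and config.GpuUsageMetric is flipped to GPUMemUtilization so the ratio model uses that metric. The standalone Go sketch below only illustrates that fallback flow against the go-nvml bindings; the helper name perProcessGPUUtil, the sample struct, and the use of device index 0 are illustrative and do not exist in Kepler, and the sketch omits the package-level processUtilizationSupported flag and the config switch that the patch performs.

package main

import (
	"fmt"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// sample keeps only the fields the patch cares about (illustrative type, not Kepler's).
type sample struct {
	Pid     uint32
	SmUtil  uint32
	MemUtil uint32
}

// perProcessGPUUtil is a hypothetical helper: it prefers per-process SM/memory
// utilization and falls back to the memory-footprint ratio when
// GetProcessUtilization is not usable on the device.
func perProcessGPUUtil(dev nvml.Device, since time.Duration) (map[uint32]sample, error) {
	out := map[uint32]sample{}

	// Preferred path: per-process utilization samples since the given timestamp.
	lastTS := uint64(time.Now().Add(-since).UnixNano() / 1000)
	procs, ret := dev.GetProcessUtilization(lastTS)
	if ret == nvml.SUCCESS {
		for _, p := range procs {
			if p.Pid != 0 { // pid 0 means no data
				out[p.Pid] = sample{Pid: p.Pid, SmUtil: p.SmUtil, MemUtil: p.MemUtil}
			}
		}
		return out, nil
	}
	if ret == nvml.ERROR_NOT_FOUND {
		return nil, nil // no process is running on the GPU
	}

	// Fallback path: approximate utilization as each compute process's share of
	// GPU memory, as the patch does when the preferred call fails.
	running, ret := dev.GetComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("GetComputeRunningProcesses: %v", nvml.ErrorString(ret))
	}
	mem, ret := dev.GetMemoryInfo()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("GetMemoryInfo: %v", nvml.ErrorString(ret))
	}
	for _, p := range running {
		if p.Pid != 0 && mem.Total > 0 {
			out[p.Pid] = sample{Pid: p.Pid, MemUtil: uint32(p.UsedGpuMemory * 100 / mem.Total)}
		}
	}
	return out, nil
}

func main() {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		fmt.Println("nvml init failed:", nvml.ErrorString(ret))
		return
	}
	defer nvml.Shutdown()

	dev, ret := nvml.DeviceGetHandleByIndex(0)
	if ret != nvml.SUCCESS {
		fmt.Println("no GPU at index 0:", nvml.ErrorString(ret))
		return
	}
	util, err := perProcessGPUUtil(dev, 10*time.Second)
	fmt.Println(util, err)
}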