metrics: support gpu memory usage based ratio, when process utilization is unavailable

Signed-off-by: Huamin Chen <[email protected]>
sustainable-computing-io · Feb 16, 2024 · e373b16 · e373b16
1 parent cd088c5
commit e373b16
Show file tree

Hide file tree

Showing 9 changed files with 80 additions and 19 deletions.
diff --git a/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o b/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o
diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go
@@ -75,7 +75,7 @@ func NewStats() *Stats {
 		m.ResourceUsage[metricName] = types.NewUInt64StatCollection()
 	}
 
-	if gpu.IsGPUCollectionSupported() {
+	if config.EnabledGPU {
 		m.ResourceUsage[config.GPUSMUtilization] = types.NewUInt64StatCollection()
 		m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection()
 	}

diff --git a/pkg/config/config.go b/pkg/config/config.go
@@ -455,3 +455,7 @@ func IsCgroupMetricsEnabled() bool {
 func IsIRQCounterMetricsEnabled() bool {
 	return ExposeIRQCounterMetrics
 }
+
+func SetGpuUsageMetric(metric string) {
+	GpuUsageMetric = metric
+}
diff --git a/pkg/metrics/consts/conts.go b/pkg/metrics/consts/conts.go
@@ -90,4 +90,8 @@ var (
 		config.CgroupfsSystemCPU,
 		config.CgroupfsUserCPU,
 	}
+	GPUMetricNames = []string{
+		config.GPUSMUtilization,
+		config.GPUMemUtilization,
+	}
 )
diff --git a/pkg/metrics/container/metrics.go b/pkg/metrics/container/metrics.go
@@ -79,6 +79,10 @@ func (c *collector) initMetrics() {
 		c.descriptions[name] = desc
 		c.collectors[name] = metricfactory.NewPromCounter(desc)
 	}
+	for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
+		c.descriptions[name] = desc
+		c.collectors[name] = metricfactory.NewPromCounter(desc)
+	}
 
 	desc := metricfactory.MetricsPromDesc(context, "joules", "_total", "", consts.ContainerEnergyLabels)
 	c.descriptions["total"] = desc

diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go
@@ -114,6 +114,16 @@ func NodeCPUFrequencyMetricsPromDesc(context string) (descriptions map[string]*p
 	return descriptions
 }
 
+func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) {
+	descriptions = make(map[string]*prometheus.Desc)
+	if config.EnabledGPU {
+		for _, name := range consts.GPUMetricNames {
+			descriptions[name] = resMetricsPromDesc(context, name, "nvidia-nvml")
+		}
+	}
+	return descriptions
+}
+
 func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) {
 	var labels []string
 	switch context {

diff --git a/pkg/metrics/process/metrics.go b/pkg/metrics/process/metrics.go
@@ -78,6 +78,10 @@ func (c *collector) initMetrics() {
 		c.descriptions[name] = desc
 		c.collectors[name] = metricfactory.NewPromCounter(desc)
 	}
+	for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
+		c.descriptions[name] = desc
+		c.collectors[name] = metricfactory.NewPromCounter(desc)
+	}
 }
 
 func (c *collector) Describe(ch chan<- *prometheus.Desc) {

diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go
@@ -66,6 +66,12 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac
 			CollectResUtil(ch, instance, collectorName, collectors[collectorName])
 		}
 	}
+
+	if config.EnabledGPU {
+		for _, collectorName := range consts.GPUMetricNames {
+			CollectResUtil(ch, instance, collectorName, collectors[collectorName])
+		}
+	}
 }
 
 func collect(ch chan<- prometheus.Metric, collector metricfactory.PromMetric, value float64, labelValues []string) {

diff --git a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go
@@ -31,6 +31,8 @@ import (
 var (
 	// List of GPU identifiers for the device
 	devices []interface{}
+	// bool to check if the process utilization collection is supported
+	processUtilizationSupported bool = true
 )
 
 type GPUNvml struct {
@@ -116,29 +118,56 @@ func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, sin
 	processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{}
 	lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000)
 
-	processUtilizationSample, ret := device.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp)
-	if ret != nvml.SUCCESS {
-		if ret == nvml.ERROR_NOT_FOUND {
-			// ignore the error if there is no process running in the GPU
-			return nil, nil
+	if processUtilizationSupported {
+		processUtilizationSample, ret := device.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp)
+		if ret != nvml.SUCCESS {
+			if ret == nvml.ERROR_NOT_FOUND {
+				// ignore the error if there is no process running in the GPU
+				return nil, nil
+			}
+			processUtilizationSupported = false
+		} else {
+			for _, pinfo := range processUtilizationSample {
+				// pid 0 means no data.
+				if pinfo.Pid != 0 {
+					processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
+						Pid:       pinfo.Pid,
+						TimeStamp: pinfo.TimeStamp,
+						SmUtil:    pinfo.SmUtil,
+						MemUtil:   pinfo.MemUtil,
+						EncUtil:   pinfo.EncUtil,
+						DecUtil:   pinfo.DecUtil,
+					}
+				}
+			}
 		}
-		return nil, fmt.Errorf("failed to get processes' utilization on device %v: %v", device, nvml.ErrorString(ret))
 	}
-
-	for _, pinfo := range processUtilizationSample {
-		// pid 0 means no data.
-		if pinfo.Pid != 0 {
-			processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
-				Pid:       pinfo.Pid,
-				TimeStamp: pinfo.TimeStamp,
-				SmUtil:    pinfo.SmUtil,
-				MemUtil:   pinfo.MemUtil,
-				EncUtil:   pinfo.EncUtil,
-				DecUtil:   pinfo.DecUtil,
+	if !processUtilizationSupported { // if processUtilizationSupported is false, try deviceGetMPSComputeRunningProcesses_v3 to use memory usage to ratio power usage
+		config.GpuUsageMetric = config.GPUMemUtilization
+		processInfo, ret := device.(nvml.Device).GetComputeRunningProcesses()
+		if ret != nvml.SUCCESS {
+			if ret == nvml.ERROR_NOT_FOUND {
+				// ignore the error if there is no process running in the GPU
+				return nil, nil
+			}
+			return nil, fmt.Errorf("failed to get processes' utilization on device %v: %v", device, nvml.ErrorString(ret))
+		}
+		memoryInfo, ret := device.(nvml.Device).GetMemoryInfo()
+		if ret != nvml.SUCCESS {
+			return nil, fmt.Errorf("failed to get memory info on device %v: %v", device, nvml.ErrorString(ret))
+		}
+		// convert processInfo to processUtilizationSample
+		for _, pinfo := range processInfo {
+			// pid 0 means no data.
+			if pinfo.Pid != 0 {
+				processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
+					Pid:     pinfo.Pid,
+					MemUtil: uint32(pinfo.UsedGpuMemory * 100 / memoryInfo.Total),
+				}
+				klog.V(5).Infof("pid: %d, memUtil: %d\n", pinfo.Pid, processAcceleratorMetrics[pinfo.Pid].MemUtil)
 			}
 		}
 	}
-
 	return processAcceleratorMetrics, nil
 }