Skip to content

Commit

Permalink
metrics: support gpu memory usage based ratio, when process utilizati…
Browse files Browse the repository at this point in the history
…on is unavailable

Signed-off-by: Huamin Chen <[email protected]>
  • Loading branch information
rootfs committed Feb 16, 2024
1 parent cd088c5 commit e373b16
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 19 deletions.
Binary file modified bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o
Binary file not shown.
2 changes: 1 addition & 1 deletion pkg/collector/stats/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func NewStats() *Stats {
m.ResourceUsage[metricName] = types.NewUInt64StatCollection()
}

if gpu.IsGPUCollectionSupported() {
if config.EnabledGPU {
m.ResourceUsage[config.GPUSMUtilization] = types.NewUInt64StatCollection()
m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection()
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,7 @@ func IsCgroupMetricsEnabled() bool {
func IsIRQCounterMetricsEnabled() bool {
return ExposeIRQCounterMetrics
}

// SetGpuUsageMetric stores metric in the package-level GpuUsageMetric
// variable, replacing whatever value was there before.
// NOTE(review): presumably this selects which GPU utilization metric is used
// to attribute GPU power to processes — confirm against the callers.
func SetGpuUsageMetric(metric string) {
GpuUsageMetric = metric
}
4 changes: 4 additions & 0 deletions pkg/metrics/consts/conts.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,8 @@ var (
config.CgroupfsSystemCPU,
config.CgroupfsUserCPU,
}
// GPUMetricNames lists the names of the GPU resource utilization metrics:
// config.GPUSMUtilization and config.GPUMemUtilization.
GPUMetricNames = []string{
config.GPUSMUtilization,
config.GPUMemUtilization,
}
)
4 changes: 4 additions & 0 deletions pkg/metrics/container/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ func (c *collector) initMetrics() {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}
for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}

desc := metricfactory.MetricsPromDesc(context, "joules", "_total", "", consts.ContainerEnergyLabels)
c.descriptions["total"] = desc
Expand Down
10 changes: 10 additions & 0 deletions pkg/metrics/metricfactory/metric_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,16 @@ func NodeCPUFrequencyMetricsPromDesc(context string) (descriptions map[string]*p
return descriptions
}

// GPUUsageMetricsPromDesc returns the Prometheus descriptors for the GPU usage
// metrics of the given context, keyed by metric name.
//
// When config.EnabledGPU is false the returned map is empty (but never nil).
// Otherwise it holds one descriptor per entry of consts.GPUMetricNames, each
// built by resMetricsPromDesc with "nvidia-nvml" as the source.
func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) {
	descriptions = map[string]*prometheus.Desc{}
	if !config.EnabledGPU {
		// GPU collection is disabled: expose no GPU descriptors.
		return descriptions
	}
	for _, metricName := range consts.GPUMetricNames {
		descriptions[metricName] = resMetricsPromDesc(context, metricName, "nvidia-nvml")
	}
	return descriptions
}

func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) {
var labels []string
switch context {
Expand Down
4 changes: 4 additions & 0 deletions pkg/metrics/process/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ func (c *collector) initMetrics() {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}
for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}
}

func (c *collector) Describe(ch chan<- *prometheus.Desc) {
Expand Down
6 changes: 6 additions & 0 deletions pkg/metrics/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac
CollectResUtil(ch, instance, collectorName, collectors[collectorName])
}
}

if config.EnabledGPU {
for _, collectorName := range consts.GPUMetricNames {
CollectResUtil(ch, instance, collectorName, collectors[collectorName])
}
}
}

func collect(ch chan<- prometheus.Metric, collector metricfactory.PromMetric, value float64, labelValues []string) {
Expand Down
65 changes: 47 additions & 18 deletions pkg/sensors/accelerator/gpu/source/gpu_nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ import (
// devices is the list of GPU identifiers for the device.
var devices []interface{}

// processUtilizationSupported reports whether per-process utilization
// collection is considered supported. It starts out true.
var processUtilizationSupported = true

type GPUNvml struct {
Expand Down Expand Up @@ -116,29 +118,56 @@ func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, sin
processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{}
lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000)

processUtilizationSample, ret := device.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp)
if ret != nvml.SUCCESS {
if ret == nvml.ERROR_NOT_FOUND {
// ignore the error if there is no process running in the GPU
return nil, nil
if processUtilizationSupported {
processUtilizationSample, ret := device.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp)
if ret != nvml.SUCCESS {
if ret == nvml.ERROR_NOT_FOUND {
// ignore the error if there is no process running in the GPU
return nil, nil
}
processUtilizationSupported = false
} else {
for _, pinfo := range processUtilizationSample {
// pid 0 means no data.
if pinfo.Pid != 0 {
processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
Pid: pinfo.Pid,
TimeStamp: pinfo.TimeStamp,
SmUtil: pinfo.SmUtil,
MemUtil: pinfo.MemUtil,
EncUtil: pinfo.EncUtil,
DecUtil: pinfo.DecUtil,
}
}
}
}
return nil, fmt.Errorf("failed to get processes' utilization on device %v: %v", device, nvml.ErrorString(ret))
}

for _, pinfo := range processUtilizationSample {
// pid 0 means no data.
if pinfo.Pid != 0 {
processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
Pid: pinfo.Pid,
TimeStamp: pinfo.TimeStamp,
SmUtil: pinfo.SmUtil,
MemUtil: pinfo.MemUtil,
EncUtil: pinfo.EncUtil,
DecUtil: pinfo.DecUtil,
if !processUtilizationSupported { // if processUtilizationSupported is false, try deviceGetMPSComputeRunningProcesses_v3 to use memory usage to ratio power usage
config.GpuUsageMetric = config.GPUMemUtilization
processInfo, ret := device.(nvml.Device).GetComputeRunningProcesses()
if ret != nvml.SUCCESS {
if ret == nvml.ERROR_NOT_FOUND {
// ignore the error if there is no process running in the GPU
return nil, nil
}
return nil, fmt.Errorf("failed to get processes' utilization on device %v: %v", device, nvml.ErrorString(ret))
}
memoryInfo, ret := device.(nvml.Device).GetMemoryInfo()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to get memory info on device %v: %v", device, nvml.ErrorString(ret))
}
// convert processInfo to processUtilizationSample
for _, pinfo := range processInfo {
// pid 0 means no data.
if pinfo.Pid != 0 {
processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{
Pid: pinfo.Pid,
MemUtil: uint32(pinfo.UsedGpuMemory * 100 / memoryInfo.Total),
}
klog.V(5).Infof("pid: %d, memUtil: %d\n", pinfo.Pid, processAcceleratorMetrics[pinfo.Pid].MemUtil)
}
}
}

return processAcceleratorMetrics, nil
}

Expand Down

0 comments on commit e373b16

Please sign in to comment.