diff --git a/go.mod b/go.mod index 5df161a474..4c2b59bf17 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/sustainable-computing-io/kepler go 1.20 require ( + github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f github.com/NVIDIA/go-nvml v0.12.0-1 github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0 github.com/containerd/cgroups v1.1.0 @@ -29,8 +30,10 @@ require ( ) require ( + github.com/Masterminds/semver v1.5.0 // indirect github.com/StackExchange/wmi v1.2.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/bits-and-blooms/bitset v1.13.0 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cilium/ebpf v0.9.1 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect @@ -67,9 +70,11 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/procfs v0.11.1 // indirect github.com/spf13/pflag v1.0.5 // indirect + github.com/stretchr/testify v1.8.4 // indirect golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.13.0 // indirect diff --git a/go.sum b/go.sum index 1da1f3f146..cf7192d91d 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= +github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8= +github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4= github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM= github.com/NVIDIA/go-nvml v0.12.0-1/go.mod 
h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA= @@ -7,6 +11,8 @@ github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0/go.mod h1:UD3Mfr+JZ/ASK2VMu github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= +github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= @@ -130,6 +136,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q= github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY= github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= @@ -156,6 +163,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod 
h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= diff --git a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go index 3a933a4da3..a1d4377792 100644 --- a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go +++ b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go @@ -46,8 +46,8 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats var err error var processesUtilization map[uint32]gpu_source.ProcessUtilizationSample // calculate the gpu's processes energy consumption for each gpu - for _, device := range gpu.GetGpus() { - if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, time.Since(lastUtilizationTimestamp)); err != nil { + for gpuID, device := range gpu.GetGpus() { + if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, gpuID, time.Since(lastUtilizationTimestamp)); err != nil { klog.Infoln(err) continue } @@ -78,8 +78,9 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats } processStats[uintPid] = stats.NewProcessStats(uintPid, uint64(0), containerID, vmID, command) } - processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.SmUtil)) - processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.MemUtil)) + gpuName := fmt.Sprintf("%s%v", 
utils.GenericGPUID, gpuID) + processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(gpuName, uint64(processUtilization.SmUtil)) + processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(gpuName, uint64(processUtilization.MemUtil)) } } diff --git a/pkg/metrics/consts/conts.go b/pkg/metrics/consts/conts.go index 786a767118..8b0d583d78 100644 --- a/pkg/metrics/consts/conts.go +++ b/pkg/metrics/consts/conts.go @@ -39,6 +39,7 @@ var ( ContainerResUtilLabels = []string{"container_id", "pod_name", "container_name", "container_namespace"} VMResUtilLabels = []string{"vm_id"} NodeResUtilLabels = []string{"device", "instance"} + GPUResUtilLabels = []string{"gpu_id"} ) var ( diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go index 705f24729b..6a132287e8 100644 --- a/pkg/metrics/metricfactory/metric_factory.go +++ b/pkg/metrics/metricfactory/metric_factory.go @@ -139,12 +139,18 @@ func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) { klog.Errorf("Unexpected prometheus context: %s", context) return } + // if this is a GPU metric, we need to add the GPU ID label + for _, gpuMetric := range consts.GPUMetricNames { + if name == gpuMetric { + labels = append(labels, consts.GPUResUtilLabels...) 
+ } + } return MetricsPromDesc(context, name, consts.UsageMetricNameSuffix, source, labels) } -func MetricsPromDesc(context, name, sufix, source string, labels []string) (desc *prometheus.Desc) { +func MetricsPromDesc(context, name, suffix, source string, labels []string) (desc *prometheus.Desc) { return prometheus.NewDesc( - prometheus.BuildFQName(consts.MetricsNamespace, context, name+sufix), + prometheus.BuildFQName(consts.MetricsNamespace, context, name+suffix), "Aggregated value in "+name+" value from "+source, labels, prometheus.Labels{"source": source}, diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go index 82bb22f837..64f6aeaa13 100644 --- a/pkg/metrics/utils/utils.go +++ b/pkg/metrics/utils/utils.go @@ -122,9 +122,25 @@ func CollectResUtil(ch chan<- prometheus.Metric, instance interface{}, metricNam switch v := instance.(type) { case *stats.ContainerStats: container := instance.(*stats.ContainerStats) - value = float64(container.ResourceUsage[metricName].SumAllAggrValues()) - labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace} - collect(ch, collector, value, labelValues) + // special case for GPU devices, the metrics are reported per device + isGPUMetric := false + for _, m := range consts.GPUMetricNames { + if metricName == m { + isGPUMetric = true + break + } + } + if isGPUMetric { + for deviceID, utilization := range container.ResourceUsage[metricName].Stat { + value = float64(utilization.Aggr) + labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace, deviceID} + collect(ch, collector, value, labelValues) + } + } else { + value = float64(container.ResourceUsage[metricName].SumAllAggrValues()) + labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace} + collect(ch, collector, value, labelValues) + } case *stats.ProcessStats: process := instance.(*stats.ProcessStats) 
diff --git a/pkg/sensors/accelerator/gpu/gpu.go b/pkg/sensors/accelerator/gpu/gpu.go index 7e7e1a3ffa..7def54d210 100644 --- a/pkg/sensors/accelerator/gpu/gpu.go +++ b/pkg/sensors/accelerator/gpu/gpu.go @@ -34,8 +34,18 @@ Then, we use gpu.go file to initialize the acceleratorImpl from power.go when gp // init initialize the acceleratorImpl and start it func init() { - acceleratorImpl = &gpu_source.GPUNvml{} + acceleratorImpl = &gpu_source.GPUDcgm{} err := acceleratorImpl.Init() + if err == nil { + klog.Infoln("Using dcgm to obtain gpu power") + // If the library was successfully initialized, we don't need to return an error in the Init() function + errLib = nil + return + } + // if dcgm fail to work, we use nvml + klog.Infof("Failed to init dcgm, err: %v\n", err) + acceleratorImpl = &gpu_source.GPUNvml{} + err = acceleratorImpl.Init() if err == nil { klog.Infoln("Using nvml to obtain gpu power") // If the library was successfully initialized, we don't need to return an error in the Init() function diff --git a/pkg/sensors/accelerator/gpu/power.go b/pkg/sensors/accelerator/gpu/power.go index f344d9a462..17ae228c18 100644 --- a/pkg/sensors/accelerator/gpu/power.go +++ b/pkg/sensors/accelerator/gpu/power.go @@ -39,11 +39,11 @@ type acceleratorInterface interface { // Shutdown stops the GPU metric collector Shutdown() bool // GetGpus returns a map with gpu device - GetGpus() []interface{} + GetGpus() map[string]interface{} // GetAbsEnergyFromGPU returns a map with mJ in each gpu device. Absolute energy is the sum of Idle + Dynamic energy. 
GetAbsEnergyFromGPU() []uint32 // GetProcessResourceUtilization returns a map of ProcessUtilizationSample where the key is the process pid - GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) + GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) // IsGPUCollectionSupported returns if it is possible to use this collector IsGPUCollectionSupported() bool // SetGPUCollectionSupported manually set if it is possible to use this collector. This is for testing purpose only. @@ -65,11 +65,11 @@ func Shutdown() bool { return true } -func GetGpus() []interface{} { +func GetGpus() map[string]interface{} { if acceleratorImpl != nil && config.EnabledGPU { return acceleratorImpl.GetGpus() } - return []interface{}{} + return map[string]interface{}{} } func GetAbsEnergyFromGPU() []uint32 { @@ -82,9 +82,9 @@ func GetAbsEnergyFromGPU() []uint32 { // GetProcessResourceUtilizationPerDevice tries to collect the GPU metrics. // There is a known issue that some clusters the nvidia GPU can stop to respod and we need to start it again. // See https://github.com/sustainable-computing-io/kepler/issues/610. 
-func GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) { +func GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) { if acceleratorImpl != nil && config.EnabledGPU { - processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, since) + processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, deviceName, since) if err != nil { klog.Infof("Failed to collect GPU metrics, trying to initizalize again: %v\n", err) err = acceleratorImpl.Init() diff --git a/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go b/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go new file mode 100644 index 0000000000..edff20a38b --- /dev/null +++ b/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go @@ -0,0 +1,376 @@ +//go:build gpu +// +build gpu + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package source + +import ( + "fmt" + "strconv" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/sustainable-computing-io/kepler/pkg/config" + "k8s.io/klog/v2" +) + +var ( + deviceFields []dcgm.Short = []dcgm.Short{ + // https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.htm + dcgm.DCGM_FI_PROF_SM_ACTIVE, + dcgm.DCGM_FI_PROF_SM_OCCUPANCY, + dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, + dcgm.DCGM_FI_DEV_POWER_USAGE, + } + deviceFieldsString = []string{ + "dcgm.DCGM_FI_PROF_SM_ACTIVE", + "dcgm.DCGM_FI_PROF_SM_OCCUPANCY", + "dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE", + "dcgm.DCGM_FI_DEV_POWER_USAGE", + } + ratioFields uint = dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE // this is the field that we will use to calculate the utilization per @yuezhu1 + SkipDCGMValue = "SKIPPING DCGM VALUE" + FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" + gpuMigArray [][]MigDevice + totalMultiProcessorCount map[string]int +) + +type GPUDcgm struct { + collectionSupported bool + devices map[string]interface{} + deviceGroupName string + deviceGroupHandle dcgm.GroupHandle + fieldGroupName string + fieldGroupHandle dcgm.FieldHandle + pidGroupName string + pidGroupHandle dcgm.GroupHandle // TODO: wait till https://github.com/NVIDIA/go-dcgm/issues/59 is resolved + entities map[string]dcgm.GroupEntityPair + cleanup func() +} + +func (d *GPUDcgm) GetName() string { + return "dcgm" +} + +func (d *GPUDcgm) Init() error { + d.devices = make(map[string]interface{}) + d.entities = make(map[string]dcgm.GroupEntityPair) + if err := d.initDCGM(); err != nil { + return err + } + + if err := d.initNVML(); err != nil { + d.Shutdown() + return err + } + + if err := d.createDeviceGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.addDevicesToGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.createFieldGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.setupWatcher(); err 
!= nil { + d.Shutdown() + return err + } + + d.collectionSupported = true + return nil +} + +func (d *GPUDcgm) Shutdown() bool { + if d.deviceGroupName != "" { + dcgm.DestroyGroup(d.deviceGroupHandle) + } + if d.fieldGroupName != "" { + dcgm.FieldGroupDestroy(d.fieldGroupHandle) + } + if d.cleanup != nil { + d.cleanup() + } + d.collectionSupported = false + return true +} + +func (d *GPUDcgm) GetAbsEnergyFromGPU() []uint32 { + gpuEnergy := []uint32{} + for _, device := range d.devices { + power, ret := device.(nvml.Device).GetPowerUsage() + if ret != nvml.SUCCESS { + klog.V(2).Infof("failed to get power usage on device %v: %v\n", device, nvml.ErrorString(ret)) + continue + } + // since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, it is + // necessary to calculate the energy consumption for the entire waiting period + energy := uint32(uint64(power) * config.SamplePeriodSec) + gpuEnergy = append(gpuEnergy, energy) + } + return gpuEnergy +} + +func (d *GPUDcgm) GetGpus() map[string]interface{} { + return d.devices +} + +func (d *GPUDcgm) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { + processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} + + if device == nil { // this is a MIG device, it is already tracked in the parent device + return processAcceleratorMetrics, nil + } + + var vals, miVals []dcgm.FieldValue_v1 + var err error + + klog.Infof("Device %v\n", deviceName) + if device != nil { // this is a GPU + deviceIndex, strErr := strconv.Atoi(deviceName) + if strErr != nil { + klog.Infof("failed to convert %q to an integer: %v", deviceName, strErr) + return processAcceleratorMetrics, strErr + } + vals, err = dcgm.GetLatestValuesForFields(uint(deviceIndex), deviceFields) + if err != nil { + klog.Infof("failed to get latest values for fields: %v", err) + return processAcceleratorMetrics, err + } + gpuSMActive := 
uint32(0) + if err == nil { + for i, val := range vals { + value := ToString(val) + label := deviceFieldsString[i] + if val.FieldId == ratioFields { + smUtil, _ := strconv.ParseFloat(value, 32) + gpuSMActive = uint32(smUtil * 100) + } + klog.Infof("Device %v Label %v Val: %v", deviceName, label, ToString(val)) + } + klog.Infof("\n") + } + processInfo, ret := device.(nvml.Device).GetComputeRunningProcesses() + if ret != nvml.SUCCESS { + klog.Infof("failed to get running processes: %v", nvml.ErrorString(ret)) + return processAcceleratorMetrics, fmt.Errorf("failed to get running processes: %v", nvml.ErrorString(ret)) + } + for _, p := range processInfo { + // klog.Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId) + if p.GpuInstanceId > 0 { // this is a MIG, get it entity id and reads the related fields + entityName := gpuMigArray[deviceIndex][p.GpuInstanceId].EntityName + multiprocessorCountRatio := gpuMigArray[deviceIndex][p.GpuInstanceId].MultiprocessorCountRatio + mi := d.entities[entityName] + miVals, err = dcgm.EntityGetLatestValues(mi.EntityGroupId, mi.EntityId, deviceFields) + if err == nil { + for i, val := range miVals { + label := deviceFieldsString[i] + value := ToString(val) + klog.Infof("Device %v Label %v Val: %v", entityName, label, value) + if val.FieldId == ratioFields { + floatVal, _ := strconv.ParseFloat(value, 32) + // ratio of active multiprocessors to total multiprocessors + smUtil := uint32(floatVal * 100 * multiprocessorCountRatio) + //klog.Infof("pid %d smUtil %d multiprocessor count ratio %v\n", p.Pid, smUtil, multiprocessorCountRatio) + processAcceleratorMetrics[p.Pid] = ProcessUtilizationSample{ + Pid: p.Pid, + TimeStamp: uint64(time.Now().UnixNano()), + SmUtil: smUtil, + } + } + } + klog.Infof("\n") + } + } else { + processAcceleratorMetrics[p.Pid] = ProcessUtilizationSample{ + Pid: p.Pid, + TimeStamp: uint64(time.Now().UnixNano()), + SmUtil: gpuSMActive, // if 
this is not a MIG, we will use the GPU SM active value. FIXME: what if there are multiple pids in the same GPU? + } + } + } + } + + return processAcceleratorMetrics, nil +} + +func (d *GPUDcgm) IsGPUCollectionSupported() bool { + return d.collectionSupported +} + +func (d *GPUDcgm) SetGPUCollectionSupported(supported bool) { + d.collectionSupported = supported +} + +// helper functions +func (d *GPUDcgm) initDCGM() error { + cleanup, err := dcgm.Init(dcgm.Embedded) + if err != nil { + if cleanup != nil { + cleanup() + } + return fmt.Errorf("not able to connect to DCGM: %s", err) + } + d.cleanup = cleanup + return nil +} + +func (d *GPUDcgm) initNVML() error { + if ret := nvml.Init(); ret != nvml.SUCCESS { + d.collectionSupported = false + d.Shutdown() + return fmt.Errorf("failed to init nvml. %s", nvmlErrorString(ret)) + } + return nil +} + +func (d *GPUDcgm) createDeviceGroup() error { + deviceGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05") + deviceGroup, err := dcgm.CreateGroup(deviceGroupName) + if err != nil { + return fmt.Errorf("failed to create group %q: %v", deviceGroupName, err) + } + d.deviceGroupName = deviceGroupName + d.deviceGroupHandle = deviceGroup + klog.Infof("Created device group %q", deviceGroupName) + return nil +} + +func (d *GPUDcgm) addDevicesToGroup() error { + supportedDeviceIndices, err := dcgm.GetSupportedDevices() + if err != nil { + return fmt.Errorf("failed to find supported devices: %v", err) + } + klog.Infof("found %d supported devices", len(supportedDeviceIndices)) + for _, gpuIndex := range supportedDeviceIndices { + err = dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU, gpuIndex) + if err != nil { + klog.Infof("failed to add device %d to group %q: %v", gpuIndex, d.deviceGroupName, err) + } else { + device, ret := nvml.DeviceGetHandleByIndex(int(gpuIndex)) + if ret != nvml.SUCCESS { + klog.Infof("failed to get nvml device %d: %v ", gpuIndex, nvml.ErrorString(ret)) + continue + } + 
            d.devices[fmt.Sprintf("%v", gpuIndex)] = device + d.entities[fmt.Sprintf("%v", gpuIndex)] = dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: gpuIndex} + } + } + // add entity to the group + hierarchy, err := dcgm.GetGpuInstanceHierarchy() + if err != nil { + d.Shutdown() + return fmt.Errorf("failed to get gpu hierarchy: %v", err) + } + + if hierarchy.Count > 0 { + // if MIG is enabled, we need to know the hierarchy as well as the multiprocessor count in each device. + // we will use the multiprocessor count to calculate the utilization of each instance + if gpuMigArray, totalMultiProcessorCount, err = RetriveFromNvidiaSMI(true); err != nil { + klog.Infof("failed to retrieve from nvidia-smi: %v", err) + // if we cannot get the multiprocessor count, we will not be able to calculate the utilization + } + for i := uint(0); i < hierarchy.Count; i++ { + if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU { + // add a GPU instance + info := hierarchy.EntityList[i].Info + entityId := hierarchy.EntityList[i].Entity.EntityId + gpuId := hierarchy.EntityList[i].Parent.EntityId + klog.Infof("gpu id %v entity id %v gpu index %v instance id %v", gpuId, entityId, info.NvmlGpuIndex, info.NvmlInstanceId) + entityName := fmt.Sprintf("entity-%d", entityId) + gpuMigArray[info.NvmlGpuIndex][info.NvmlInstanceId].EntityName = entityName + err = dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU_I, entityId) + d.entities[entityName] = dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: entityId} + klog.Infof("Adding GPU instance %d, err: %v", entityId, err) + } + } + } + return nil +} + +func (d *GPUDcgm) createFieldGroup() error { + fieldGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05") + fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields) + if err != nil { + return fmt.Errorf("failed to create field group %q: %v", fieldGroupName, err) + } + d.fieldGroupName = fieldGroupName + d.fieldGroupHandle = fieldGroup + return nil +} + +func (d *GPUDcgm) setupWatcher() 
error { + err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(config.SamplePeriodSec*1000), 0.0, 1) + if err != nil { + return fmt.Errorf("failed to set up watcher, err %v", err) + } + return nil +} + +// ToString converts a dcgm.FieldValue_v1 to a string +// credit to dcgm_exporter +func ToString(value dcgm.FieldValue_v1) string { + switch v := value.Int64(); v { + case dcgm.DCGM_FT_INT32_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: + return SkipDCGMValue + } + switch v := value.Float64(); v { + case dcgm.DCGM_FT_FP64_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: + return SkipDCGMValue + } + switch v := value.FieldType; v { + case dcgm.DCGM_FT_STRING: + return value.String() + case dcgm.DCGM_FT_DOUBLE: + return fmt.Sprintf("%f", value.Float64()) + case dcgm.DCGM_FT_INT64: + return fmt.Sprintf("%d", value.Int64()) + default: + return FailedToConvert + } + + return FailedToConvert +} diff --git a/pkg/sensors/accelerator/gpu/source/gpu_dummy.go b/pkg/sensors/accelerator/gpu/source/gpu_dummy.go index aa7237ee78..c9200c2b9d 100644 --- a/pkg/sensors/accelerator/gpu/source/gpu_dummy.go +++ b/pkg/sensors/accelerator/gpu/source/gpu_dummy.go @@ -21,8 +21,6 @@ package source import ( "time" - - "github.com/NVIDIA/go-nvml/pkg/nvml" ) type GPUDummy struct { @@ -47,13 +45,12 @@ func (d *GPUDummy) GetAbsEnergyFromGPU() []uint32 { return []uint32{} } -func (d *GPUDummy) GetGpus() []interface{} { - var devices 
[]interface{} - devices = append(devices, nvml.Device{}) +func (d *GPUDummy) GetGpus() map[string]interface{} { + var devices map[string]interface{} return devices } -func (n *GPUDummy) GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { +func (n *GPUDummy) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} processAcceleratorMetrics[0] = ProcessUtilizationSample{ Pid: 0, diff --git a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go index cd87962507..a078c2d136 100644 --- a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go +++ b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go @@ -30,7 +30,7 @@ import ( var ( // List of GPU identifiers for the device - devices []interface{} + devices map[string]interface{} // bool to check if the process utilization collection is supported processUtilizationSupported bool = true ) @@ -66,7 +66,7 @@ func (n *GPUNvml) Init() (err error) { return err } klog.Infof("found %d gpu devices\n", count) - devices = make([]interface{}, count) + devices = make(map[string]interface{}, count) for i := 0; i < count; i++ { device, ret := nvml.DeviceGetHandleByIndex(i) if ret != nvml.SUCCESS { @@ -76,8 +76,9 @@ func (n *GPUNvml) Init() (err error) { return err } name, _ := device.GetName() - klog.Infoln("GPU", i, name) - devices[i] = device + uuid, _ := device.GetUUID() + klog.Infof("GPU %v %q %q", i, name, uuid) + devices[uuid] = device } n.collectionSupported = true return nil @@ -89,7 +90,7 @@ func (n *GPUNvml) Shutdown() bool { } // GetGpus returns a map with gpu device -func (n *GPUNvml) GetGpus() []interface{} { +func (n *GPUNvml) GetGpus() map[string]interface{} { return devices } @@ -114,7 +115,7 @@ func (n *GPUNvml) GetAbsEnergyFromGPU() []uint32 { // // 
ProcessUtilizationSample.SmUtil represents the process Streaming Multiprocessors - SM (3D/Compute) utilization in percentage. // ProcessUtilizationSample.MemUtil represents the process Frame Buffer Memory utilization Value. -func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { +func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000) @@ -164,7 +165,7 @@ func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, sin Pid: pinfo.Pid, MemUtil: uint32(pinfo.UsedGpuMemory * 100 / memoryInfo.Total), } - klog.V(5).Infof("pid: %d, memUtil: %d\n", pinfo.Pid, processAcceleratorMetrics[pinfo.Pid].MemUtil) + klog.V(1).Infof("pid: %d, memUtil: %d gpu instance %d compute instance %d\n", pinfo.Pid, processAcceleratorMetrics[pinfo.Pid].MemUtil, pinfo.GpuInstanceId, pinfo.ComputeInstanceId) } } } diff --git a/pkg/sensors/accelerator/gpu/source/nvml_util.go b/pkg/sensors/accelerator/gpu/source/nvml_util.go new file mode 100644 index 0000000000..6ef8b70b53 --- /dev/null +++ b/pkg/sensors/accelerator/gpu/source/nvml_util.go @@ -0,0 +1,145 @@ +//go:build gpu +// +build gpu + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package source + +import ( + "encoding/xml" + "fmt" + "os/exec" + + "k8s.io/klog/v2" +) + +type NvidiaSmiLog struct { + XMLName xml.Name `xml:"nvidia_smi_log"` + Timestamp string `xml:"timestamp,omitempty"` + DriverVersion string `xml:"driver_version,omitempty"` + CudaVersion string `xml:"cuda_version,omitempty"` + AttachedGPUs int `xml:"attached_gpus,omitempty"` + GPU []GPU `xml:"gpu"` +} + +type GPU struct { + ID string `xml:"id,attr"` + MigMode MigMode `xml:"mig_mode,omitempty"` + MigDevices []MigDevice `xml:"mig_devices>mig_device,omitempty"` + UUID string `xml:"uuid,omitempty"` +} + +type MigMode struct { + CurrentMig string `xml:"current_mig,omitempty"` + PendingMig string `xml:"pending_mig,omitempty"` +} + +type MigDevice struct { + Index int `xml:"index,omitempty"` + GPUInstanceID int `xml:"gpu_instance_id,omitempty"` + ComputeInstanceID int `xml:"compute_instance_id,omitempty"` + DeviceAttributes DeviceAttributes `xml:"device_attributes,omitempty"` + ECCErrorCount ECCErrorCount `xml:"ecc_error_count,omitempty"` + FBMemoryUsage MemoryUsage `xml:"fb_memory_usage,omitempty"` + Bar1MemoryUsage MemoryUsage `xml:"bar1_memory_usage,omitempty"` + EntityName string // this is set later + MultiprocessorCountRatio float64 // this is set later +} + +type DeviceAttributes struct { + Shared SharedAttributes `xml:"shared,omitempty"` +} + +type SharedAttributes struct { + MultiprocessorCount int `xml:"multiprocessor_count,omitempty"` + CopyEngineCount int `xml:"copy_engine_count,omitempty"` + EncoderCount int `xml:"encoder_count,omitempty"` + DecoderCount int `xml:"decoder_count,omitempty"` + OFACount int `xml:"ofa_count,omitempty"` + JPGCount int `xml:"jpg_count,omitempty"` +} + +type ECCErrorCount struct { + VolatileCount VolatileCount `xml:"volatile_count,omitempty"` +} + +type VolatileCount struct { + SRAMUncorrectable int `xml:"sram_uncorrectable,omitempty"` +} + +type MemoryUsage struct { + Total string `xml:"total,omitempty"` + Reserved string 
`xml:"reserved,omitempty"` + Used string `xml:"used,omitempty"` + Free string `xml:"free,omitempty"` +} + +// RetriveFromNvidiaSMI retrieves the MIG information from nvidia-smi +func RetriveFromNvidiaSMI(debug bool) (gpuMigArray [][]MigDevice, totalMultiProcessorCount map[string]int, err error) { + cmd := exec.Command("nvidia-smi", "-q", "-x") + output, err := cmd.Output() + if err != nil { + err = fmt.Errorf("error running nvidia-smi command: %w", err) + return + } + + var nvidiaSmiLog NvidiaSmiLog + err = xml.Unmarshal(output, &nvidiaSmiLog) + if err != nil { + err = fmt.Errorf("error unmarshaling XML: %w", err) + return + } + + gpuMigArray = make([][]MigDevice, len(nvidiaSmiLog.GPU)) + totalMultiProcessorCount = make(map[string]int, len(nvidiaSmiLog.GPU)) + for i, gpu := range nvidiaSmiLog.GPU { + // find the largest GPUInstanceID among the MIGDevices, to make sure we have enough space in the array + maxGPUInstanceID := 0 + for _, migDevice := range gpu.MigDevices { + if migDevice.GPUInstanceID > maxGPUInstanceID { + maxGPUInstanceID = migDevice.GPUInstanceID + } + } + gpuMigArray[i] = make([]MigDevice, maxGPUInstanceID+1) + totalMultiProcessorCount[gpu.UUID] = 0 + for _, migDevice := range gpu.MigDevices { + gpuMigArray[i][migDevice.GPUInstanceID] = migDevice + totalMultiProcessorCount[gpu.UUID] += migDevice.DeviceAttributes.Shared.MultiprocessorCount + } + // count MultiprocessorCountRatio for each device + for j, migDevice := range gpuMigArray[i] { + gpuMigArray[i][j].MultiprocessorCountRatio = float64(migDevice.DeviceAttributes.Shared.MultiprocessorCount) / float64(totalMultiProcessorCount[gpu.UUID]) + } + } + + if debug { + for i, gpu := range nvidiaSmiLog.GPU { + for _, device := range gpuMigArray[i] { + klog.Infof("GPU %d %q", i, gpu.UUID) + klog.Infof("\tIndex: %d\n", device.Index) + klog.Infof("\tGPUInstanceID: %d\n", device.GPUInstanceID) + klog.Infof("\tComputeInstanceID: %d\n", device.ComputeInstanceID) + klog.Infof("\tShared MultiprocessorCount: %d\n", 
device.DeviceAttributes.Shared.MultiprocessorCount) + klog.Infof("\tShared MultiprocessorCountRatio: %f\n", device.MultiprocessorCountRatio) + klog.Infof("\tFBMemoryUsage Total: %s\n", device.FBMemoryUsage.Total) + klog.Infof("\tFBMemoryUsage Reserved: %s\n", device.FBMemoryUsage.Reserved) + klog.Infof("\tFBMemoryUsage Used: %s\n", device.FBMemoryUsage.Used) + klog.Infof("\tFBMemoryUsage Free: %s\n", device.FBMemoryUsage.Free) + } + } + } + return +} diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index ffe7d38685..0166131d3c 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -49,6 +49,7 @@ const ( SystemProcessNamespace string = "system" EmptyString string = "" GenericSocketID string = "socket0" + GenericGPUID string = "gpu" ) func GetPathFromPID(searchPath string, pid uint64) (string, error) { diff --git a/vendor/modules.txt b/vendor/modules.txt index 717873c366..cbe311bb66 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,3 +1,9 @@ +# github.com/Masterminds/semver v1.5.0 +## explicit +github.com/Masterminds/semver +# github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f +## explicit; go 1.21 +github.com/NVIDIA/go-dcgm/pkg/dcgm # github.com/NVIDIA/go-nvml v0.12.0-1 ## explicit; go 1.15 github.com/NVIDIA/go-nvml/pkg/dl @@ -11,6 +17,9 @@ github.com/aquasecurity/libbpfgo # github.com/beorn7/perks v1.0.1 ## explicit; go 1.11 github.com/beorn7/perks/quantile +# github.com/bits-and-blooms/bitset v1.13.0 +## explicit; go 1.16 +github.com/bits-and-blooms/bitset # github.com/cespare/xxhash/v2 v2.2.0 ## explicit; go 1.11 github.com/cespare/xxhash/v2 @@ -248,6 +257,9 @@ github.com/opencontainers/runtime-spec/specs-go # github.com/pkg/errors v0.9.1 ## explicit github.com/pkg/errors +# github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 +## explicit +github.com/pmezard/go-difflib/difflib # github.com/prometheus/client_golang v1.17.0 ## explicit; go 1.19 github.com/prometheus/client_golang/api @@ -283,6 +295,9 @@ 
github.com/sirupsen/logrus # github.com/spf13/pflag v1.0.5 ## explicit; go 1.12 github.com/spf13/pflag +# github.com/stretchr/testify v1.8.4 +## explicit; go 1.20 +github.com/stretchr/testify/assert # golang.org/x/exp v0.0.0-20231006140011-7918f672742d ## explicit; go 1.20 golang.org/x/exp/constraints