Skip to content

Commit

Permalink
fix #1233: track process usage on each GPU device
Browse files Browse the repository at this point in the history
Signed-off-by: Huamin Chen <[email protected]>

add gpu uuid

Signed-off-by: Huamin Chen <[email protected]>

add nvidia-smi parser to get MIG info

Signed-off-by: Huamin Chen <[email protected]>

add dcgm

Signed-off-by: Huamin Chen <[email protected]>

update

Signed-off-by: Huamin Chen <[email protected]>

add counter

Signed-off-by: Huamin Chen <[email protected]>

add counter

Signed-off-by: Huamin Chen <[email protected]>

update gpu counter

Signed-off-by: Huamin Chen <[email protected]>

update gpu counter

Signed-off-by: Huamin Chen <[email protected]>

update gpu startup sequence

Signed-off-by: Huamin Chen <[email protected]>

refactor gpu interface; retrieve GPU hierarchy and calculate SM ratio

Signed-off-by: Huamin Chen <[email protected]>

tracking processes in MIG and parent GPU

Signed-off-by: Huamin Chen <[email protected]>

calculate process util based on SM ratio

Signed-off-by: Huamin Chen <[email protected]>

update

Signed-off-by: Huamin Chen <[email protected]>

calculate process util based on SM ratio

Signed-off-by: Huamin Chen <[email protected]>

refactor

Signed-off-by: Huamin Chen <[email protected]>

fix wrong name

Signed-off-by: Huamin Chen <[email protected]>

review feedback

Signed-off-by: Huamin Chen <[email protected]>
  • Loading branch information
rootfs committed Feb 16, 2024
1 parent e373b16 commit 3a06f00
Show file tree
Hide file tree
Showing 14 changed files with 611 additions and 29 deletions.
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/sustainable-computing-io/kepler
go 1.20

require (
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f
github.com/NVIDIA/go-nvml v0.12.0-1
github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0
github.com/containerd/cgroups v1.1.0
Expand All @@ -29,8 +30,10 @@ require (
)

require (
github.com/Masterminds/semver v1.5.0 // indirect
github.com/StackExchange/wmi v1.2.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.13.0 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cilium/ebpf v0.9.1 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
Expand Down Expand Up @@ -67,9 +70,11 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/testify v1.8.4 // indirect
golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.13.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y=
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8=
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM=
github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
Expand All @@ -7,6 +11,8 @@ github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0/go.mod h1:UD3Mfr+JZ/ASK2VMu
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE=
github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4=
Expand Down Expand Up @@ -130,6 +136,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q=
github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY=
github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw=
Expand All @@ -156,6 +163,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats
var err error
var processesUtilization map[uint32]gpu_source.ProcessUtilizationSample
// calculate the gpu's processes energy consumption for each gpu
for _, device := range gpu.GetGpus() {
if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, time.Since(lastUtilizationTimestamp)); err != nil {
for gpuID, device := range gpu.GetGpus() {
if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, gpuID, time.Since(lastUtilizationTimestamp)); err != nil {
klog.Infoln(err)
continue
}
Expand Down Expand Up @@ -78,8 +78,9 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats
}
processStats[uintPid] = stats.NewProcessStats(uintPid, uint64(0), containerID, vmID, command)
}
processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.SmUtil))
processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.MemUtil))
gpuName := fmt.Sprintf("%s%v", utils.GenericGPUID, gpuID)
processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(gpuName, uint64(processUtilization.SmUtil))
processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(gpuName, uint64(processUtilization.MemUtil))
}
}

Expand Down
1 change: 1 addition & 0 deletions pkg/metrics/consts/conts.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var (
ContainerResUtilLabels = []string{"container_id", "pod_name", "container_name", "container_namespace"}
VMResUtilLabels = []string{"vm_id"}
NodeResUtilLabels = []string{"device", "instance"}
GPUResUtilLabels = []string{"gpu_id"}
)

var (
Expand Down
10 changes: 8 additions & 2 deletions pkg/metrics/metricfactory/metric_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,18 @@ func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) {
klog.Errorf("Unexpected prometheus context: %s", context)
return
}
// if this is a GPU metric, we need to add the GPU ID label
for _, gpuMetric := range consts.GPUMetricNames {
if name == gpuMetric {
labels = append(labels, consts.GPUResUtilLabels...)
}
}
return MetricsPromDesc(context, name, consts.UsageMetricNameSuffix, source, labels)
}

func MetricsPromDesc(context, name, sufix, source string, labels []string) (desc *prometheus.Desc) {
func MetricsPromDesc(context, name, suffix, source string, labels []string) (desc *prometheus.Desc) {
return prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, context, name+sufix),
prometheus.BuildFQName(consts.MetricsNamespace, context, name+suffix),
"Aggregated value in "+name+" value from "+source,
labels,
prometheus.Labels{"source": source},
Expand Down
22 changes: 19 additions & 3 deletions pkg/metrics/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,25 @@ func CollectResUtil(ch chan<- prometheus.Metric, instance interface{}, metricNam
switch v := instance.(type) {
case *stats.ContainerStats:
container := instance.(*stats.ContainerStats)
value = float64(container.ResourceUsage[metricName].SumAllAggrValues())
labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace}
collect(ch, collector, value, labelValues)
// special case for GPU devices, the metrics are reported per device
isGPUMetric := false
for _, m := range consts.GPUMetricNames {
if metricName == m {
isGPUMetric = true
break
}
}
if isGPUMetric {
for deviceID, utilization := range container.ResourceUsage[metricName].Stat {
value = float64(utilization.Aggr)
labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace, deviceID}
collect(ch, collector, value, labelValues)
}
} else {
value = float64(container.ResourceUsage[metricName].SumAllAggrValues())
labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace}
collect(ch, collector, value, labelValues)
}

case *stats.ProcessStats:
process := instance.(*stats.ProcessStats)
Expand Down
12 changes: 11 additions & 1 deletion pkg/sensors/accelerator/gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,18 @@ Then, we use gpu.go file to initialize the acceleratorImpl from power.go when gp

// init initialize the acceleratorImpl and start it
func init() {
acceleratorImpl = &gpu_source.GPUNvml{}
acceleratorImpl = &gpu_source.GPUDcgm{}
err := acceleratorImpl.Init()
if err == nil {
klog.Infoln("Using dcgm to obtain gpu power")
// If the library was successfully initialized, we don't need to return an error in the Init() function
errLib = nil
return
}
// if dcgm fails to initialize, fall back to nvml
klog.Infof("Failed to init dcgm, err: %v\n", err)
acceleratorImpl = &gpu_source.GPUNvml{}
err = acceleratorImpl.Init()
if err == nil {
klog.Infoln("Using nvml to obtain gpu power")
// If the library was successfully initialized, we don't need to return an error in the Init() function
Expand Down
12 changes: 6 additions & 6 deletions pkg/sensors/accelerator/gpu/power.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ type acceleratorInterface interface {
// Shutdown stops the GPU metric collector
Shutdown() bool
// GetGpus returns a map with gpu device
GetGpus() []interface{}
GetGpus() map[string]interface{}
// GetAbsEnergyFromGPU returns a map with mJ in each gpu device. Absolute energy is the sum of Idle + Dynamic energy.
GetAbsEnergyFromGPU() []uint32
// GetProcessResourceUtilizationPerDevice returns a map of ProcessUtilizationSample for the given device, where the key is the process pid
GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error)
GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error)
// IsGPUCollectionSupported returns if it is possible to use this collector
IsGPUCollectionSupported() bool
// SetGPUCollectionSupported manually set if it is possible to use this collector. This is for testing purpose only.
Expand All @@ -65,11 +65,11 @@ func Shutdown() bool {
return true
}

func GetGpus() []interface{} {
func GetGpus() map[string]interface{} {
if acceleratorImpl != nil && config.EnabledGPU {
return acceleratorImpl.GetGpus()
}
return []interface{}{}
return map[string]interface{}{}
}

func GetAbsEnergyFromGPU() []uint32 {
Expand All @@ -82,9 +82,9 @@ func GetAbsEnergyFromGPU() []uint32 {
// GetProcessResourceUtilizationPerDevice tries to collect the GPU metrics.
// There is a known issue that in some clusters the nvidia GPU can stop responding and we need to start it again.
// See https://github.com/sustainable-computing-io/kepler/issues/610.
func GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) {
func GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) {
if acceleratorImpl != nil && config.EnabledGPU {
processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, since)
processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, deviceName, since)
if err != nil {
klog.Infof("Failed to collect GPU metrics, trying to initizalize again: %v\n", err)
err = acceleratorImpl.Init()
Expand Down
Loading

0 comments on commit 3a06f00

Please sign in to comment.