Skip to content

Commit

Permalink
metrics: support gpu memory usage based ratio, when process utilizati…
Browse files Browse the repository at this point in the history
…on is unavailable

Signed-off-by: Huamin Chen <[email protected]>
  • Loading branch information
rootfs committed Feb 20, 2024
1 parent 72fde6a commit 973f761
Show file tree
Hide file tree
Showing 98 changed files with 33,581 additions and 57 deletions.
20 changes: 11 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ else
GC_FLAGS =
endif

GENERAL_TAGS := 'include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo gpu libbpf '
GENERAL_TAGS := 'include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo libbpf '
GPU_TAGS := ' gpu '
GO_LD_FLAGS := $(GC_FLAGS) -ldflags "-X $(LD_FLAGS)" $(CFLAGS)

# set GOENV
Expand All @@ -53,7 +54,8 @@ GOENV = GO111MODULE="" GOOS=$(GOOS) GOARCH=$(GOARCH) CGO_ENABLED=1 CC=clang CGO_

DOCKERFILE := $(SRC_ROOT)/build/Dockerfile
IMAGE_BUILD_TAG := $(SOURCE_GIT_TAG)-linux-$(GOARCH)
GO_BUILD_TAGS := $(GENERAL_TAGS)$(GOOS)
GO_BUILD_TAGS := $(GENERAL_TAGS)$(GOOS)$(GPU_TAGS)
GO_TEST_TAGS := $(GENERAL_TAGS)$(GOOS)

# for testsuite
ENVTEST_ASSETS_DIR=$(SRC_ROOT)/test-bin
Expand Down Expand Up @@ -214,28 +216,28 @@ container_test:
make test-container-verbose'

test: ginkgo-set tidy-vendor
@echo TAGS=$(GO_BUILD_TAGS)
@$(GOENV) go test -tags $(GO_BUILD_TAGS) ./... --race --bench=. -cover --count=1 --vet=all
@echo TAGS=$(GO_TEST_TAGS)
@$(GOENV) go test -tags $(GO_TEST_TAGS) ./... --race --bench=. -cover --count=1 --vet=all -v

test-verbose: ginkgo-set tidy-vendor
@echo TAGS=$(GO_BUILD_TAGS)
@echo TAGS=$(GO_TEST_TAGS)
@echo GOENV=$(GOENV)
@$(GOENV) go test -tags $(GO_BUILD_TAGS) \
@$(GOENV) go test -tags $(GO_TEST_TAGS) \
-timeout=30m \
-covermode=atomic -coverprofile=coverage.out \
-v $$(go list ./... | grep pkg | grep -v bpfassets) \
--race --bench=. -cover --count=1 --vet=all

test-container-verbose: ginkgo-set tidy-vendor
@echo TAGS=$(GO_BUILD_TAGS)
@echo TAGS=$(GO_TEST_TAGS)
@echo GOENV=$(GOENV)
@$(GOENV) go test -tags $(GO_BUILD_TAGS) \
@$(GOENV) go test -tags $(GO_TEST_TAGS) \
-covermode=atomic -coverprofile=coverage.out \
-v $$(go list ./... | grep pkg | grep -v bpfassets) \
--race -cover --count=1 --vet=all

test-mac-verbose: ginkgo-set
@echo TAGS=$(GO_BUILD_TAGS)
@echo TAGS=$(GO_TEST_TAGS)
@go test $$(go list ./... | grep pkg | grep -v bpfassets) --race --bench=. -cover --count=1 --vet=all

escapes_detect: tidy-vendor
Expand Down
Binary file modified bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o
Binary file not shown.
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/sustainable-computing-io/kepler
go 1.20

require (
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f
github.com/NVIDIA/go-nvml v0.12.0-1
github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0
github.com/containerd/cgroups v1.1.0
Expand All @@ -29,8 +30,10 @@ require (
)

require (
github.com/Masterminds/semver v1.5.0 // indirect
github.com/StackExchange/wmi v1.2.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.13.0 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cilium/ebpf v0.9.1 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
Expand Down Expand Up @@ -67,9 +70,11 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/testify v1.8.4 // indirect
golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.13.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y=
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8=
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM=
github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
Expand All @@ -7,6 +11,8 @@ github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0/go.mod h1:UD3Mfr+JZ/ASK2VMu
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE=
github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4=
Expand Down Expand Up @@ -130,6 +136,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q=
github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY=
github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw=
Expand All @@ -156,6 +163,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
Expand Down
2 changes: 1 addition & 1 deletion pkg/bpfassets/attacher/libbpf_attacher.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ var (
uint64Key uint64
maxRetry = config.MaxLookupRetry
bpfArrays = []string{
"cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_event_reader",
"cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_ms_event_reader",
"cpu_cycles", "cpu_ref_cycles", "cpu_instructions", "cache_miss", "cpu_freq_array", "task_clock",
}
cpuCores = getCPUCores()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats
var err error
var processesUtilization map[uint32]gpu_source.ProcessUtilizationSample
// calculate the gpu's processes energy consumption for each gpu
for _, device := range gpu.GetGpus() {
if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, time.Since(lastUtilizationTimestamp)); err != nil {
for gpuID, device := range gpu.GetGpus() {
if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, gpuID, time.Since(lastUtilizationTimestamp)); err != nil {
klog.Infoln(err)
continue
}
Expand Down Expand Up @@ -78,8 +78,9 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats
}
processStats[uintPid] = stats.NewProcessStats(uintPid, uint64(0), containerID, vmID, command)
}
processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.SmUtil))
processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.MemUtil))
gpuName := fmt.Sprintf("%s%v", utils.GenericGPUID, gpuID)
processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(gpuName, uint64(processUtilization.SmUtil))
processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(gpuName, uint64(processUtilization.MemUtil))
}
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func NewStats() *Stats {
m.ResourceUsage[metricName] = types.NewUInt64StatCollection()
}

if gpu.IsGPUCollectionSupported() {
if config.EnabledGPU {
m.ResourceUsage[config.GPUSMUtilization] = types.NewUInt64StatCollection()
m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection()
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,7 @@ func IsCgroupMetricsEnabled() bool {
func IsIRQCounterMetricsEnabled() bool {
return ExposeIRQCounterMetrics
}

// SetGpuUsageMetric overwrites GpuUsageMetric (declared outside this function)
// with the given metric name. No validation of the value is performed here.
func SetGpuUsageMetric(metric string) {
GpuUsageMetric = metric
}
5 changes: 5 additions & 0 deletions pkg/metrics/consts/conts.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var (
ContainerResUtilLabels = []string{"container_id", "pod_name", "container_name", "container_namespace"}
VMResUtilLabels = []string{"vm_id"}
NodeResUtilLabels = []string{"device", "instance"}
GPUResUtilLabels = []string{"gpu_id"}
)

var (
Expand Down Expand Up @@ -90,4 +91,8 @@ var (
config.CgroupfsSystemCPU,
config.CgroupfsUserCPU,
}
GPUMetricNames = []string{
config.GPUSMUtilization,
config.GPUMemUtilization,
}
)
4 changes: 4 additions & 0 deletions pkg/metrics/container/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ func (c *collector) initMetrics() {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}
for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}

desc := metricfactory.MetricsPromDesc(context, "joules", "_total", "", consts.ContainerEnergyLabels)
c.descriptions["total"] = desc
Expand Down
20 changes: 18 additions & 2 deletions pkg/metrics/metricfactory/metric_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,16 @@ func NodeCPUFrequencyMetricsPromDesc(context string) (descriptions map[string]*p
return descriptions
}

// GPUUsageMetricsPromDesc builds the Prometheus descriptors for the GPU usage
// metrics of the given context.
//
// The returned map is keyed by metric name and holds one descriptor per entry
// of consts.GPUMetricNames, each created through resMetricsPromDesc. When
// config.EnabledGPU is false the map is returned empty (never nil).
//
// NOTE(review): the source passed to resMetricsPromDesc is always
// "nvidia-nvml" — confirm this is intended if another GPU backend is active.
func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) {
	descriptions = map[string]*prometheus.Desc{}
	if !config.EnabledGPU {
		// GPU collection disabled: nothing to describe.
		return descriptions
	}
	for _, metricName := range consts.GPUMetricNames {
		descriptions[metricName] = resMetricsPromDesc(context, metricName, "nvidia-nvml")
	}
	return descriptions
}

func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) {
var labels []string
switch context {
Expand All @@ -129,12 +139,18 @@ func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) {
klog.Errorf("Unexpected prometheus context: %s", context)
return
}
// if this is a GPU metric, we need to add the GPU ID label
for _, gpuMetric := range consts.GPUMetricNames {
if name == gpuMetric {
labels = append(labels, consts.GPUResUtilLabels...)
}
}
return MetricsPromDesc(context, name, consts.UsageMetricNameSuffix, source, labels)
}

func MetricsPromDesc(context, name, sufix, source string, labels []string) (desc *prometheus.Desc) {
func MetricsPromDesc(context, name, suffix, source string, labels []string) (desc *prometheus.Desc) {
return prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, context, name+sufix),
prometheus.BuildFQName(consts.MetricsNamespace, context, name+suffix),
"Aggregated value in "+name+" value from "+source,
labels,
prometheus.Labels{"source": source},
Expand Down
4 changes: 4 additions & 0 deletions pkg/metrics/process/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ func (c *collector) initMetrics() {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}
for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}
}

func (c *collector) Describe(ch chan<- *prometheus.Desc) {
Expand Down
28 changes: 25 additions & 3 deletions pkg/metrics/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac
CollectResUtil(ch, instance, collectorName, collectors[collectorName])
}
}

if config.EnabledGPU {
for _, collectorName := range consts.GPUMetricNames {
CollectResUtil(ch, instance, collectorName, collectors[collectorName])
}
}
}

func collect(ch chan<- prometheus.Metric, collector metricfactory.PromMetric, value float64, labelValues []string) {
Expand Down Expand Up @@ -116,9 +122,25 @@ func CollectResUtil(ch chan<- prometheus.Metric, instance interface{}, metricNam
switch v := instance.(type) {
case *stats.ContainerStats:
container := instance.(*stats.ContainerStats)
value = float64(container.ResourceUsage[metricName].SumAllAggrValues())
labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace}
collect(ch, collector, value, labelValues)
// special case for GPU devices, the metrics are reported per device
isGPUMetric := false
for _, m := range consts.GPUMetricNames {
if metricName == m {
isGPUMetric = true
break
}
}
if isGPUMetric {
for deviceID, utilization := range container.ResourceUsage[metricName].Stat {
value = float64(utilization.Aggr)
labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace, deviceID}
collect(ch, collector, value, labelValues)
}
} else {
value = float64(container.ResourceUsage[metricName].SumAllAggrValues())
labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace}
collect(ch, collector, value, labelValues)
}

case *stats.ProcessStats:
process := instance.(*stats.ProcessStats)
Expand Down
12 changes: 11 additions & 1 deletion pkg/sensors/accelerator/gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,18 @@ Then, we use gpu.go file to initialize the acceleratorImpl from power.go when gp

// init initialize the acceleratorImpl and start it
func init() {
acceleratorImpl = &gpu_source.GPUNvml{}
acceleratorImpl = &gpu_source.GPUDcgm{}
err := acceleratorImpl.Init()
if err == nil {
klog.Infoln("Using dcgm to obtain gpu power")
// If the library was successfully initialized, we don't need to return an error in the Init() function
errLib = nil
return
}
// if dcgm fails to initialize, we fall back to nvml
klog.Infof("Failed to init dcgm, err: %v\n", err)
acceleratorImpl = &gpu_source.GPUNvml{}
err = acceleratorImpl.Init()
if err == nil {
klog.Infoln("Using nvml to obtain gpu power")
// If the library was successfully initialized, we don't need to return an error in the Init() function
Expand Down
12 changes: 6 additions & 6 deletions pkg/sensors/accelerator/gpu/power.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ type acceleratorInterface interface {
// Shutdown stops the GPU metric collector
Shutdown() bool
// GetGpus returns a map with gpu device
GetGpus() []interface{}
GetGpus() map[string]interface{}
// GetAbsEnergyFromGPU returns a slice with the energy in mJ of each gpu device. Absolute energy is the sum of Idle + Dynamic energy.
GetAbsEnergyFromGPU() []uint32
// GetProcessResourceUtilizationPerDevice returns a map of ProcessUtilizationSample where the key is the process pid
GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error)
GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error)
// IsGPUCollectionSupported returns if it is possible to use this collector
IsGPUCollectionSupported() bool
// SetGPUCollectionSupported manually set if it is possible to use this collector. This is for testing purpose only.
Expand All @@ -65,11 +65,11 @@ func Shutdown() bool {
return true
}

func GetGpus() []interface{} {
func GetGpus() map[string]interface{} {
if acceleratorImpl != nil && config.EnabledGPU {
return acceleratorImpl.GetGpus()
}
return []interface{}{}
return map[string]interface{}{}
}

func GetAbsEnergyFromGPU() []uint32 {
Expand All @@ -82,9 +82,9 @@ func GetAbsEnergyFromGPU() []uint32 {
// GetProcessResourceUtilizationPerDevice tries to collect the GPU metrics.
// There is a known issue that in some clusters the nvidia GPU can stop responding and we need to start it again.
// See https://github.com/sustainable-computing-io/kepler/issues/610.
func GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) {
func GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) {
if acceleratorImpl != nil && config.EnabledGPU {
processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, since)
processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, deviceName, since)
if err != nil {
klog.Infof("Failed to collect GPU metrics, trying to initizalize again: %v\n", err)
err = acceleratorImpl.Init()
Expand Down
Loading

0 comments on commit 973f761

Please sign in to comment.