Skip to content

Commit

Permalink
NVIDIA Mig support (#1238)
Browse files Browse the repository at this point in the history
* metrics: support gpu memory usage based ratio, when process utilization is unavailable

Signed-off-by: Huamin Chen <[email protected]>

* review feedback

Signed-off-by: Huamin Chen <[email protected]>

* add dcgm to kepler image

Signed-off-by: Huamin Chen <[email protected]>

* gpu: switch to dcgm standalone mode due to containerization limitations

Signed-off-by: Huamin Chen <[email protected]>

* fix yaml

Signed-off-by: Huamin Chen <[email protected]>

---------

Signed-off-by: Huamin Chen <[email protected]>
  • Loading branch information
rootfs authored Feb 21, 2024
1 parent be639f1 commit 87228e6
Show file tree
Hide file tree
Showing 106 changed files with 33,692 additions and 97 deletions.
15 changes: 13 additions & 2 deletions .github/workflows/image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
username: ${{ secrets.username }}
password: ${{ secrets.password }}

- name: Build and push kepler latest (libbpf)
- name: Build and push kepler latest (no dcgm)
uses: docker/build-push-action@v5
with:
context: .
Expand All @@ -43,6 +43,17 @@ jobs:
labels: ${{ inputs.imageTag }}
file: build/Dockerfile

# Build and (optionally) push the DCGM-enabled image variant.
# INSTALL_DCGM is passed as a build argument to build/Dockerfile; this variant
# is built for linux/amd64 only.
# The tag/label suffix is "-dcgm" (previously misspelled "-dgcm"), matching the
# "-dcgm" image tag produced by the Makefile's containerized build.
- name: Build and push kepler latest (with dcgm)
  uses: docker/build-push-action@v5
  with:
    context: .
    platforms: linux/amd64
    build-args: INSTALL_DCGM="true"
    push: ${{ inputs.pushImage }}
    tags: quay.io/sustainable_computing_io/kepler:${{ inputs.imageTag }}-dcgm
    labels: ${{ inputs.imageTag }}-dcgm
    file: build/Dockerfile

- name: Build and push kepler-validator to official group repo
uses: docker/build-push-action@v5
with:
Expand All @@ -65,4 +76,4 @@ jobs:
with:
name: sbom-kepler-${{ inputs.imageTag }}.json
path: sbom-kepler-${{ inputs.imageTag }}.json
retention-days: 1
retention-days: 1
34 changes: 24 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ else
GC_FLAGS =
endif

GENERAL_TAGS := 'include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo gpu libbpf '
GENERAL_TAGS := 'include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo libbpf '
GPU_TAGS := ' gpu '
GO_LD_FLAGS := $(GC_FLAGS) -ldflags "-X $(LD_FLAGS)" $(CFLAGS)

# set GOENV
Expand All @@ -53,7 +54,8 @@ GOENV = GO111MODULE="" GOOS=$(GOOS) GOARCH=$(GOARCH) CGO_ENABLED=1 CC=clang CGO_

DOCKERFILE := $(SRC_ROOT)/build/Dockerfile
IMAGE_BUILD_TAG := $(SOURCE_GIT_TAG)-linux-$(GOARCH)
GO_BUILD_TAGS := $(GENERAL_TAGS)$(GOOS)
GO_BUILD_TAGS := $(GENERAL_TAGS)$(GOOS)$(GPU_TAGS)
GO_TEST_TAGS := $(GENERAL_TAGS)$(GOOS)

# for testsuite
ENVTEST_ASSETS_DIR=$(SRC_ROOT)/test-bin
Expand Down Expand Up @@ -86,7 +88,7 @@ clean: clean-cross-build
build_containerized: tidy-vendor format
@if [ -z '$(CTR_CMD)' ] ; then echo '!! ERROR: containerized builds require podman||docker CLI, none found $$PATH' >&2 && exit 1; fi
echo BIN_TIMESTAMP==$(BIN_TIMESTAMP)

# build kepler without dcgm
$(CTR_CMD) build -t $(IMAGE_REPO)/$(IMAGE_NAME):$(IMAGE_BUILD_TAG) \
-f $(DOCKERFILE) \
--network host \
Expand All @@ -97,6 +99,18 @@ build_containerized: tidy-vendor format

$(CTR_CMD) tag $(IMAGE_REPO)/$(IMAGE_NAME):$(IMAGE_BUILD_TAG) $(IMAGE_REPO)/$(IMAGE_NAME):$(IMAGE_TAG)

# build kepler with dcgm
$(CTR_CMD) build -t $(IMAGE_REPO)/$(IMAGE_NAME):$(IMAGE_BUILD_TAG)-"dcgm" \
-f $(DOCKERFILE) \
--network host \
--build-arg SOURCE_GIT_TAG=$(SOURCE_GIT_TAG) \
--build-arg BIN_TIMESTAMP=$(BIN_TIMESTAMP) \
--build-arg INSTALL_DCGM="true" \
--platform="linux/$(GOARCH)" \
.

$(CTR_CMD) tag $(IMAGE_REPO)/$(IMAGE_NAME):$(IMAGE_BUILD_TAG)-dcgm $(IMAGE_REPO)/$(IMAGE_NAME):$(IMAGE_TAG)-dcgm

.PHONY: build_containerized

save-image:
Expand Down Expand Up @@ -214,28 +228,28 @@ container_test:
make test-container-verbose'

test: ginkgo-set tidy-vendor
@echo TAGS=$(GO_BUILD_TAGS)
@$(GOENV) go test -tags $(GO_BUILD_TAGS) ./... --race --bench=. -cover --count=1 --vet=all
@echo TAGS=$(GO_TEST_TAGS)
@$(GOENV) go test -tags $(GO_TEST_TAGS) ./... --race --bench=. -cover --count=1 --vet=all -v

test-verbose: ginkgo-set tidy-vendor
@echo TAGS=$(GO_BUILD_TAGS)
@echo TAGS=$(GO_TEST_TAGS)
@echo GOENV=$(GOENV)
@$(GOENV) go test -tags $(GO_BUILD_TAGS) \
@$(GOENV) go test -tags $(GO_TEST_TAGS) \
-timeout=30m \
-covermode=atomic -coverprofile=coverage.out \
-v $$(go list ./... | grep pkg | grep -v bpfassets) \
--race --bench=. -cover --count=1 --vet=all

test-container-verbose: ginkgo-set tidy-vendor
@echo TAGS=$(GO_BUILD_TAGS)
@echo TAGS=$(GO_TEST_TAGS)
@echo GOENV=$(GOENV)
@$(GOENV) go test -tags $(GO_BUILD_TAGS) \
@$(GOENV) go test -tags $(GO_TEST_TAGS) \
-covermode=atomic -coverprofile=coverage.out \
-v $$(go list ./... | grep pkg | grep -v bpfassets) \
--race -cover --count=1 --vet=all

test-mac-verbose: ginkgo-set
@echo TAGS=$(GO_BUILD_TAGS)
@echo TAGS=$(GO_TEST_TAGS)
@go test $$(go list ./... | grep pkg | grep -v bpfassets) --race --bench=. -cover --count=1 --vet=all

escapes_detect: tidy-vendor
Expand Down
Binary file modified bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o
Binary file not shown.
16 changes: 13 additions & 3 deletions build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,29 @@ COPY . .
RUN ATTACHER_TAG=libbpf make build

FROM registry.access.redhat.com/ubi9:9.2
ARG INSTALL_DCGM
ARG INSTALL_DCGM=${INSTALL_DCGM:-""}

RUN yum -y update
RUN yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
RUN if [ $(uname -i) == "x86_64" ]; then yum install -y cpuid; fi

ENV NVIDIA_VISIBLE_DEVICES=all
# add utility to support nvidia-smi
ENV NVIDIA_DRIVER_CAPABILITIES=utility
ENV NVIDIA_MIG_CONFIG_DEVICES=all
ENV NVIDIA_MIG_MONITOR_DEVICES=all

RUN INSTALL_PKGS=" \
libbpf \
" && \
yum install -y $INSTALL_PKGS && \
yum clean all
yum install -y $INSTALL_PKGS

RUN if [[ ! -z "$INSTALL_DCGM" ]]; then \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
yum install -y datacenter-gpu-manager; \
fi

RUN yum clean all

COPY --from=builder /workspace/_output/bin/kepler /usr/bin/kepler
COPY --from=builder /libbpf-source/linux-5.14.0-333.el9/tools/bpf/bpftool/bpftool /usr/bin/bpftool
Expand Down
3 changes: 2 additions & 1 deletion cmd/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,10 +203,11 @@ func main() {
// the GPU operator typically takes longer to initialize than kepler, which results in errors when starting the gpu driver;
// therefore, we wait up to 1 min to allow the gpu operator to initialize
for i := 0; i <= maxGPUInitRetry; i++ {
time.Sleep(6 * time.Second)
err = gpu.Init()
if err == nil {
break
} else {
time.Sleep(6 * time.Second)
}
}
if err == nil {
Expand Down
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/sustainable-computing-io/kepler
go 1.20

require (
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f
github.com/NVIDIA/go-nvml v0.12.0-1
github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0
github.com/containerd/cgroups v1.1.0
Expand All @@ -29,8 +30,10 @@ require (
)

require (
github.com/Masterminds/semver v1.5.0 // indirect
github.com/StackExchange/wmi v1.2.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.13.0 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cilium/ebpf v0.9.1 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
Expand Down Expand Up @@ -67,9 +70,11 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/testify v1.8.4 // indirect
golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.13.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y=
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8=
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM=
github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
Expand All @@ -7,6 +11,8 @@ github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0/go.mod h1:UD3Mfr+JZ/ASK2VMu
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE=
github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4=
Expand Down Expand Up @@ -130,6 +136,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q=
github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY=
github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw=
Expand All @@ -156,6 +163,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
Expand Down
2 changes: 1 addition & 1 deletion pkg/bpfassets/attacher/libbpf_attacher.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ var (
uint64Key uint64
maxRetry = config.MaxLookupRetry
bpfArrays = []string{
"cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_event_reader",
"cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_ms_event_reader",
"cpu_cycles", "cpu_ref_cycles", "cpu_instructions", "cache_miss", "cpu_freq_array", "task_clock",
}
cpuCores = getCPUCores()
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/energy/node_energy_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ func UpdateNodeComponentsEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup)
// UpdateNodeGPUEnergy updates each GPU power consumption. Right now we don't support other types of accelerators
func UpdateNodeGPUEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) {
defer wg.Done()
if config.EnabledGPU {
if config.EnabledGPU && gpu.IsGPUCollectionSupported() {
gpuEnergy := gpu.GetAbsEnergyFromGPU()
for gpu, energy := range gpuEnergy {
nodeStats.EnergyUsage[config.AbsEnergyInGPU].SetDeltaStat(fmt.Sprintf("%d", gpu), uint64(energy))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package accelerator
import (
"fmt"
"os"
"strconv"
"time"

"github.com/sustainable-computing-io/kepler/pkg/cgroup"
Expand Down Expand Up @@ -47,12 +46,11 @@ func UpdateProcessGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessSt
var err error
var processesUtilization map[uint32]gpu_source.ProcessUtilizationSample
// calculate the gpu's processes energy consumption for each gpu
for i, device := range gpu.GetGpus() {
if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, time.Since(lastUtilizationTimestamp)); err != nil {
for gpuID, device := range gpu.GetGpus() {
if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, gpuID, time.Since(lastUtilizationTimestamp)); err != nil {
klog.Infoln(err)
continue
}
gpuID := strconv.Itoa(i)

for pid, processUtilization := range processesUtilization {
uintPid := uint64(pid)
Expand Down Expand Up @@ -80,8 +78,9 @@ func UpdateProcessGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessSt
}
processStats[uintPid] = stats.NewProcessStats(uintPid, uint64(0), containerID, vmID, command)
}
processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(gpuID, uint64(processUtilization.SmUtil))
processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(gpuID, uint64(processUtilization.MemUtil))
gpuName := fmt.Sprintf("%s%v", utils.GenericGPUID, gpuID)
processStats[uintPid].ResourceUsage[config.GPUComputeUtilization].AddDeltaStat(gpuName, uint64(processUtilization.ComputeUtil))
processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(gpuName, uint64(processUtilization.MemUtil))
}
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/node_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func (ne *NodeStats) ResetDeltaValues() {
func (ne *NodeStats) UpdateIdleEnergyWithMinValue(isComponentsSystemCollectionSupported bool) {
// gpu metric
if config.EnabledGPU && gpu.IsGPUCollectionSupported() {
ne.CalcIdleEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.GPUSMUtilization)
ne.CalcIdleEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.GPUComputeUtilization)
}

if isComponentsSystemCollectionSupported {
Expand Down
4 changes: 2 additions & 2 deletions pkg/collector/stats/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ func NewStats() *Stats {
m.ResourceUsage[metricName] = types.NewUInt64StatCollection()
}

if gpu.IsGPUCollectionSupported() {
m.ResourceUsage[config.GPUSMUtilization] = types.NewUInt64StatCollection()
if config.EnabledGPU && gpu.IsGPUCollectionSupported() {
m.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection()
m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection()
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ func getProcessFeatureNames() []string {

// gpu metric
if config.EnabledGPU && gpu.IsGPUCollectionSupported() {
gpuMetrics := []string{config.GPUSMUtilization, config.GPUMemUtilization}
gpuMetrics := []string{config.GPUComputeUtilization, config.GPUMemUtilization}
metrics = append(metrics, gpuMetrics...)
klog.V(3).Infof("Available GPU metrics: %v", gpuMetrics)
}
Expand Down
9 changes: 8 additions & 1 deletion pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ var (
CoreUsageMetric = getConfig("CORE_USAGE_METRIC", CPUInstruction)
DRAMUsageMetric = getConfig("DRAM_USAGE_METRIC", CacheMiss)
UncoreUsageMetric = getConfig("UNCORE_USAGE_METRIC", defaultMetricValue) // no metric (evenly divided)
GpuUsageMetric = getConfig("GPU_USAGE_METRIC", GPUSMUtilization) // no metric (evenly divided)
GpuUsageMetric = getConfig("GPU_USAGE_METRIC", GPUComputeUtilization) // no metric (evenly divided)
GeneralUsageMetric = getConfig("GENERAL_USAGE_METRIC", defaultMetricValue) // for uncategorized energy

SamplePeriodSec = uint64(getIntConfig("SAMPLE_PERIOD_SEC", defaultSamplePeriodSec))
Expand All @@ -101,6 +101,9 @@ var (

configPath = "/etc/kepler/kepler.config"

// nvidia dcgm hostengine endpoint
DCGMHostEngineEndpoint = getConfig("NVIDIA_HOSTENGINE_ENDPOINT", "localhost:5555")

// dir of kernel sources for bcc
kernelSourceDirs = []string{}

Expand Down Expand Up @@ -455,3 +458,7 @@ func IsCgroupMetricsEnabled() bool {
// IsIRQCounterMetricsEnabled returns the current value of ExposeIRQCounterMetrics.
func IsIRQCounterMetricsEnabled() bool {
return ExposeIRQCounterMetrics
}

// SetGpuUsageMetric overwrites the package-level GpuUsageMetric with metric.
// The supplied name is stored as-is; no validation is performed here.
func SetGpuUsageMetric(metric string) {
GpuUsageMetric = metric
}
4 changes: 2 additions & 2 deletions pkg/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ const (
CPUFrequency = "avg_cpu_frequency"

// NVIDIA GPU
GPUSMUtilization = "gpu_sm_util"
GPUMemUtilization = "gpu_mem_util"
GPUComputeUtilization = "gpu_compute_util"
GPUMemUtilization = "gpu_mem_util"

// Intel QuickAssist Technology (QAT)
// TODO: test if different request has different energy consumption.
Expand Down
5 changes: 5 additions & 0 deletions pkg/metrics/consts/conts.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var (
ContainerResUtilLabels = []string{"container_id", "pod_name", "container_name", "container_namespace"}
VMResUtilLabels = []string{"vm_id"}
NodeResUtilLabels = []string{"device", "instance"}
GPUResUtilLabels = []string{"gpu_id"}
)

var (
Expand Down Expand Up @@ -90,4 +91,8 @@ var (
config.CgroupfsSystemCPU,
config.CgroupfsUserCPU,
}
GPUMetricNames = []string{
config.GPUComputeUtilization,
config.GPUMemUtilization,
}
)
4 changes: 4 additions & 0 deletions pkg/metrics/container/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ func (c *collector) initMetrics() {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}
for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) {
c.descriptions[name] = desc
c.collectors[name] = metricfactory.NewPromCounter(desc)
}

desc := metricfactory.MetricsPromDesc(context, "joules", "_total", "", consts.ContainerEnergyLabels)
c.descriptions["total"] = desc
Expand Down
Loading

0 comments on commit 87228e6

Please sign in to comment.