diff --git a/charts/vantage-kubernetes-agent/Chart.yaml b/charts/vantage-kubernetes-agent/Chart.yaml index e95723c..2ff5ef6 100644 --- a/charts/vantage-kubernetes-agent/Chart.yaml +++ b/charts/vantage-kubernetes-agent/Chart.yaml @@ -2,6 +2,6 @@ apiVersion: v2 name: vantage-kubernetes-agent description: Provisions the Vantage Kubernetes agent. type: application -version: 1.0.33 -appVersion: "1.0.25" +version: 1.0.34 +appVersion: "1.0.26" icon: "https://assets.vantage.sh/www/vantage_avatar-social.jpg" diff --git a/charts/vantage-kubernetes-agent/templates/application.yaml b/charts/vantage-kubernetes-agent/templates/application.yaml index c5e929f..34a612f 100644 --- a/charts/vantage-kubernetes-agent/templates/application.yaml +++ b/charts/vantage-kubernetes-agent/templates/application.yaml @@ -93,6 +93,26 @@ spec: - name: VANTAGE_REPORT_HTTP_PROXY value: "{{ .Values.agent.reportHTTPProxy }}" {{- end}} + {{- if .Values.agent.gpu.usageMetrics}} + - name: VANTAGE_GPU_METRICS + value: "true" + {{- end}} + {{- if .Values.agent.gpu.exporterNamespace}} + - name: VANTAGE_GPU_EXPORTER_NAMESPACE + value: "{{ .Values.agent.gpu.exporterNamespace }}" + {{- end}} + {{- if .Values.agent.gpu.exporterServiceName}} + - name: VANTAGE_GPU_EXPORTER_SERVICE_NAME + value: "{{ .Values.agent.gpu.exporterServiceName }}" + {{- end}} + {{- if .Values.agent.gpu.exporterPortName}} + - name: VANTAGE_GPU_EXPORTER_PORT_NAME + value: "{{ .Values.agent.gpu.exporterPortName }}" + {{- end}} + {{- if .Values.agent.gpu.exporterPath}} + - name: VANTAGE_GPU_EXPORTER_PATH + value: "{{ .Values.agent.gpu.exporterPath }}" + {{- end}} - name: VANTAGE_API_TOKEN valueFrom: secretKeyRef: diff --git a/charts/vantage-kubernetes-agent/templates/clusterrole.yaml b/charts/vantage-kubernetes-agent/templates/clusterrole.yaml index 35f7eff..b0043e0 100644 --- a/charts/vantage-kubernetes-agent/templates/clusterrole.yaml +++ b/charts/vantage-kubernetes-agent/templates/clusterrole.yaml @@ -33,6 +33,12 @@ rules: - "jobs" - 
"cronjobs" verbs: ["get", "watch", "list"] +{{- if .Values.agent.gpu.usageMetrics}} +- apiGroups: [""] + resources: + - "endpoints" + verbs: ["get", "watch", "list"] +{{- end}} {{- if .Values.agent.argocdRollouts}} - apiGroups: ["argoproj.io"] resources: diff --git a/charts/vantage-kubernetes-agent/values.schema.json b/charts/vantage-kubernetes-agent/values.schema.json index fd9944d..da13375 100644 --- a/charts/vantage-kubernetes-agent/values.schema.json +++ b/charts/vantage-kubernetes-agent/values.schema.json @@ -26,6 +26,26 @@ "disableKubeTLSverify": { "type": "string" }, + "gpu": { + "type": "object", + "properties": { + "exporterNamespace": { + "type": "string" + }, + "exporterPath": { + "type": "string" + }, + "exporterPortName": { + "type": "string" + }, + "exporterServiceName": { + "type": "string" + }, + "usageMetrics": { + "type": "boolean" + } + } + }, "logLevel": { "type": "string" }, diff --git a/charts/vantage-kubernetes-agent/values.yaml b/charts/vantage-kubernetes-agent/values.yaml index c0b913f..4fc6434 100644 --- a/charts/vantage-kubernetes-agent/values.yaml +++ b/charts/vantage-kubernetes-agent/values.yaml @@ -44,6 +44,22 @@ agent: # Optional. URL of an HTTP proxy used for external communications during the periodic report. This includes HTTP requests to the Vantage API and AWS S3. ie. http://example.com:3001 reportHTTPProxy: "" + gpu: + # Optional. Whether the agent should scrape GPU metrics or not. Requires NVIDIA dcgm-exporter to be deployed and configured with DCGM_FI_DEV_FB_USED and DCGM_FI_DEV_FB_TOTAL. + usageMetrics: false + # Optional. The namespace the GPU exporter is deployed in. + # Uses agent default if not specified: "gpu-operator" + exporterNamespace: "" + # Optional. The service name for the GPU exporter. Used to look up the endpoints. + # Uses agent default if not specified: "nvidia-dcgm-exporter" + exporterServiceName: "" + # Optional. The port name for the metrics port of the GPU exporter. 
+ # Uses agent default if not specified: "gpu-metrics" + exporterPortName: "" + # Optional. The path the metrics endpoint is available under. + # Uses agent default if not specified: "/metrics" + exporterPath: "" + persist: mountPath: "/var/lib/vantage-agent"