Skip to content

Commit

Permalink
Add support for GPU exporters.
Browse files Browse the repository at this point in the history
  • Loading branch information
macb committed May 30, 2024
1 parent d81b300 commit 392babe
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 2 deletions.
4 changes: 2 additions & 2 deletions charts/vantage-kubernetes-agent/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ apiVersion: v2
name: vantage-kubernetes-agent
description: Provisions the Vantage Kubernetes agent.
type: application
version: 1.0.33
appVersion: "1.0.25"
version: 1.0.34
appVersion: "1.0.26"
icon: "https://assets.vantage.sh/www/vantage_avatar-social.jpg"
20 changes: 20 additions & 0 deletions charts/vantage-kubernetes-agent/templates/application.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,26 @@ spec:
- name: VANTAGE_REPORT_HTTP_PROXY
value: "{{ .Values.agent.reportHTTPProxy }}"
{{- end}}
{{- if .Values.agent.gpu.usageMetrics}}
- name: VANTAGE_GPU_METRICS
value: "true"
{{- end}}
{{- if .Values.agent.gpu.exporterNamespace}}
- name: VANTAGE_GPU_EXPORTER_NAMESPACE
value: "{{ .Values.agent.gpu.exporterNamespace }}"
{{- end}}
{{- if .Values.agent.gpu.exporterServiceName}}
- name: VANTAGE_GPU_EXPORTER_SERVICE_NAME
value: "{{ .Values.agent.gpu.exporterServiceName }}"
{{- end}}
{{- if .Values.agent.gpu.exporterPortName}}
- name: VANTAGE_GPU_EXPORTER_PORT_NAME
value: "{{ .Values.agent.gpu.exporterPortName }}"
{{- end}}
{{- if .Values.agent.gpu.exporterPath}}
- name: VANTAGE_GPU_EXPORTER_PATH
value: "{{ .Values.agent.gpu.exporterPath }}"
{{- end}}
- name: VANTAGE_API_TOKEN
valueFrom:
secretKeyRef:
Expand Down
6 changes: 6 additions & 0 deletions charts/vantage-kubernetes-agent/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ rules:
- "jobs"
- "cronjobs"
verbs: ["get", "watch", "list"]
{{- if .Values.agent.gpu.usageMetrics}}
- apiGroups: [""]
resources:
- "endpoints"
verbs: ["get", "watch", "list"]
{{- end}}
{{- if .Values.agent.argocdRollouts}}
- apiGroups: ["argoproj.io"]
resources:
Expand Down
20 changes: 20 additions & 0 deletions charts/vantage-kubernetes-agent/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,26 @@
"disableKubeTLSverify": {
"type": "string"
},
"gpu": {
"type": "object",
"properties": {
"exporterNamespace": {
"type": "string"
},
"exporterPath": {
"type": "string"
},
"exporterPortName": {
"type": "string"
},
"exporterServiceName": {
"type": "string"
},
"usageMetrics": {
"type": "boolean"
}
}
},
"logLevel": {
"type": "string"
},
Expand Down
16 changes: 16 additions & 0 deletions charts/vantage-kubernetes-agent/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,22 @@ agent:
# Optional. URL of an HTTP proxy used for external communications during the periodic report. This includes HTTP requests to the Vantage API and AWS S3. Example: http://example.com:3001
reportHTTPProxy: ""

gpu:
# Optional. Whether the agent should scrape GPU metrics. Requires NVIDIA dcgm-exporter to be deployed and configured to expose the DCGM_FI_DEV_FB_USED and DCGM_FI_DEV_FB_TOTAL metrics.
usageMetrics: false
# Optional. The namespace the GPU exporter is deployed in.
# Uses agent default if not specified: "gpu-operator"
exporterNamespace: ""
# Optional. The service name for the GPU exporter. Used to look up the endpoints.
# Uses agent default if not specified: "nvidia-dcgm-exporter"
exporterServiceName: ""
# Optional. The port name for the metrics port of the GPU exporter.
# Uses agent default if not specified: "gpu-metrics"
exporterPortName: ""
# Optional. The path the metrics endpoint is available under.
# Uses agent default if not specified: "/metrics"
exporterPath: ""


persist:
mountPath: "/var/lib/vantage-agent"
Expand Down

0 comments on commit 392babe

Please sign in to comment.