From e354d3b201a53eacd6b466fc083d354800da4422 Mon Sep 17 00:00:00 2001 From: Fred Heinecke Date: Fri, 12 Apr 2024 17:01:12 -0500 Subject: [PATCH 1/8] Added Helm chart --- helm/Chart.yaml | 22 ++++++ helm/grafana-dashboards | 1 + helm/templates/_helpers.tpl | 51 +++++++++++++ helm/templates/dashboards.yaml | 17 +++++ helm/templates/deployment.yaml | 110 +++++++++++++++++++++++++++++ helm/templates/service.yaml | 22 ++++++ helm/templates/servicemonitor.yaml | 25 +++++++ helm/values.yaml | 81 +++++++++++++++++++++ 8 files changed, 329 insertions(+) create mode 100644 helm/Chart.yaml create mode 120000 helm/grafana-dashboards create mode 100644 helm/templates/_helpers.tpl create mode 100644 helm/templates/dashboards.yaml create mode 100644 helm/templates/deployment.yaml create mode 100644 helm/templates/service.yaml create mode 100644 helm/templates/servicemonitor.yaml create mode 100644 helm/values.yaml diff --git a/helm/Chart.yaml b/helm/Chart.yaml new file mode 100644 index 0000000..7eba089 --- /dev/null +++ b/helm/Chart.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: v2 +name: aws-quota-checker +description: AWS quota checker for Prometheus +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.0.0-dev +# This is the version number of the application being deployed. 
This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "0.0.0-dev" diff --git a/helm/grafana-dashboards b/helm/grafana-dashboards new file mode 120000 index 0000000..4c4f702 --- /dev/null +++ b/helm/grafana-dashboards @@ -0,0 +1 @@ +../grafana-dashboards \ No newline at end of file diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl new file mode 100644 index 0000000..d2f7aef --- /dev/null +++ b/helm/templates/_helpers.tpl @@ -0,0 +1,51 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "aws-quota-checker.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "aws-quota-checker.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "aws-quota-checker.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "aws-quota-checker.commonLabels" -}} +helm.sh/chart: {{ include "aws-quota-checker.chart" . }} +{{ include "aws-quota-checker.selectorCommonLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "aws-quota-checker.selectorCommonLabels" -}} +app.kubernetes.io/name: {{ include "aws-quota-checker.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/dashboards.yaml b/helm/templates/dashboards.yaml new file mode 100644 index 0000000..43e2180 --- /dev/null +++ b/helm/templates/dashboards.yaml @@ -0,0 +1,17 @@ +{{- if .Values.dashboards.enabled }} +{{- $files := .Files.Glob "grafana-dashboards/*.json" }} +{{- if $files }} +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: {{ include "aws-quota-checker.fullname" . }}-grafana-dashboards + labels: + {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} + {{- with .Values.dashboards.additionalLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} +data: +{{ $files.AsConfig | indent 2 }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml new file mode 100644 index 0000000..650592d --- /dev/null +++ b/helm/templates/deployment.yaml @@ -0,0 +1,110 @@ +{{- range .Values.checker.aws.regions }} +{{- $region := . }} +{{- with $ }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ printf "%s-%s" (include "aws-quota-checker.fullname" .) ($region) }} + labels: + {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} + aws-quota-checker.gravitational.io/region: {{ $region | quote }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "aws-quota-checker.selectorCommonLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "aws-quota-checker.commonLabels" . 
| nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + aws-quota-checker.gravitational.io/region: {{ $region | quote }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + {{- if .Values.checker.enableDebugLogging }} + - --debug + {{- end }} + - prometheus-checker + - --port + - "8080" + - --region + - {{ $region | quote }} + {{- if .Values.checker.aws.profileName }} + - --profile + - {{ .Values.checker.aws.profileName | quote }} + {{- end }} + {{- if .Values.checker.aws.quotaLimitCheckIntervalSeconds }} + - --limits-check-interval + - {{ .Values.checker.aws.quotaLimitCheckIntervalSeconds | quote }} + {{- end }} + {{- if .Values.checker.aws.quotaCurrentValueCheckIntervalSeconds }} + - --currents-check-interval + - {{ .Values.checker.aws.quotaCurrentValueCheckIntervalSeconds | quote }} + {{- end }} + {{- if .Values.checker.aws.refreshResourcesIntervalSeconds }} + - --reload-checks-interval + - {{ .Values.checker.aws.refreshResourcesIntervalSeconds | quote }} + {{- end }} + {{- if .Values.checker.prometheus.metricsPrefix }} + - --namespace + - {{ .Values.checker.prometheus.metricsPrefix | quote }} + {{- end }} + {{- if .Values.checker.prometheus.enableDurationMetrics }} + - --enable-duration-metrics + {{- else }} + - --disable-duration-metrics + {{- end }} + - {{ join "," .Values.checker.aws.enabledChecks }} + {{- if .Values.checker.aws.credentialSecretName }} + envFrom: + - secretRef: + name: {{ .Values.checker.aws.credentialSecretName | quote }} + optional: false + {{- end }} + ports: + - name: metrics + containerPort: 8080 + protocol: 
TCP + livenessProbe: + httpGet: + path: /metrics + port: metrics + readinessProbe: + httpGet: + path: /metrics + port: metrics + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/service.yaml b/helm/templates/service.yaml new file mode 100644 index 0000000..3cbe198 --- /dev/null +++ b/helm/templates/service.yaml @@ -0,0 +1,22 @@ +{{- range .Values.checker.aws.regions }} +{{- $region := . }} +{{- with $ }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ printf "%s-%s" (include "aws-quota-checker.fullname" .) ($region) }} + labels: + {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: metrics + protocol: TCP + name: metrics + selector: + {{- include "aws-quota-checker.selectorCommonLabels" . | nindent 4 }} + aws-quota-checker.gravitational.io/region: {{ $region | quote }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/servicemonitor.yaml b/helm/templates/servicemonitor.yaml new file mode 100644 index 0000000..0cc6460 --- /dev/null +++ b/helm/templates/servicemonitor.yaml @@ -0,0 +1,25 @@ +{{- if .Values.serviceMonitor.enabled }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "aws-quota-checker.fullname" . }} + labels: + {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} +spec: + jobLabel: app.kubernetes.io/name + podTargetLabels: + - aws-quota-checker.gravitational.io/region + selector: + matchLabels: + {{- include "aws-quota-checker.selectorCommonLabels" . 
| nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace | quote }} + endpoints: + - honorLabels: true + path: /metrics + port: metrics + scheme: http + scrapeTimeout: 30s +{{- end }} \ No newline at end of file diff --git a/helm/values.yaml b/helm/values.yaml new file mode 100644 index 0000000..8062298 --- /dev/null +++ b/helm/values.yaml @@ -0,0 +1,81 @@ +--- +# Default values for aws-quota-checker. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +image: + repository: ghcr.io/gravitational/aws-quota-checker + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +checker: + enableDebugLogging: false + aws: + regions: + - us-east-1 + - us-east-2 + - us-west-1 + - us-west-2 + enabledChecks: + - all + # credentialSecretName: "" # Optional. Contains the name of a secret in the namespace that has standard AWS credential environment vars. + # profileName: "" + # quotaLimitCheckIntervalSeconds: 600 + # quotaCurrentValueCheckIntervalSeconds: 300 + # refreshResourcesIntervalSeconds: 300 + prometheus: + # metricsPrefix: "" + enableDurationMetrics: true + +# Grafana dashboards +dashboards: + enabled: false + additionalLabels: + grafana_dashboard: "1" # Default label for Grafana Helm chart dashboard sidecar + +podAnnotations: {} +podLabels: {} + +podSecurityContext: + {} + # fsGroup: 2000 + +securityContext: + {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 10014 # Next available port from https://github.com/prometheus/prometheus/wiki/Default-port-allocations + +serviceMonitor: + enabled: false + +resources: + {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. 
This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {} From ccd407e20e5b36af8303083b8fc837f30b45a4dc Mon Sep 17 00:00:00 2001 From: Fred Heinecke Date: Fri, 12 Apr 2024 18:05:06 -0500 Subject: [PATCH 2/8] Added support for a service account, to be used with IRSA --- helm/templates/deployment.yaml | 3 +++ helm/templates/serviceaccount.yaml | 13 +++++++++++++ helm/values.yaml | 5 +++++ 3 files changed, 21 insertions(+) create mode 100644 helm/templates/serviceaccount.yaml diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index 650592d..82b4ec5 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -33,6 +33,9 @@ spec: {{- end }} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- if .Values.serviceAccount.enabled }} + serviceAccountName: {{ include "aws-quota-checker.fullname" . }} + {{- end }} containers: - name: {{ .Chart.Name }} securityContext: diff --git a/helm/templates/serviceaccount.yaml b/helm/templates/serviceaccount.yaml new file mode 100644 index 0000000..7f75a4e --- /dev/null +++ b/helm/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.enabled }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "aws-quota-checker.fullname" . }} + labels: + {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/values.yaml b/helm/values.yaml index 8062298..829bd9a 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -61,6 +61,11 @@ service: serviceMonitor: enabled: false +serviceAccount: + enabled: true + annotations: + # key: value + resources: {} # We usually recommend not to specify default resources and to leave this as a conscious From ad01e8049edc678648048838f39a9d02cc96ec0a Mon Sep 17 00:00:00 2001 From: Fred Heinecke Date: Tue, 16 Apr 2024 13:29:15 -0500 Subject: [PATCH 3/8] Added support for disabling specific dashboards --- helm/templates/_helpers.tpl | 14 ++++++++++++++ helm/templates/dashboards.yaml | 5 +---- helm/templates/deployment.yaml | 2 +- helm/values.yaml | 4 ++++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index d2f7aef..1d497bb 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -48,4 +48,18 @@ Selector labels {{- define "aws-quota-checker.selectorCommonLabels" -}} app.kubernetes.io/name: {{ include "aws-quota-checker.name" . 
}} app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Dashboards to deploy +*/}} +{{- define "aws-quota-checker.enabledDashboardsYaml" -}} +{{- $filteredFiles := dict }} +{{- range $fileName, $fileContent := .Files.Glob "grafana-dashboards/*.json" }} +{{- $baseFileName := base $fileName }} +{{- if not (has $baseFileName $.Values.dashboards.ignoredDashboards) }} +{{- $_ := set $filteredFiles $baseFileName ($fileContent | toString) }} +{{- end }} +{{- end }} +{{- $filteredFiles | toYaml }} {{- end }} \ No newline at end of file diff --git a/helm/templates/dashboards.yaml b/helm/templates/dashboards.yaml index 43e2180..bfd448e 100644 --- a/helm/templates/dashboards.yaml +++ b/helm/templates/dashboards.yaml @@ -1,6 +1,4 @@ {{- if .Values.dashboards.enabled }} -{{- $files := .Files.Glob "grafana-dashboards/*.json" }} -{{- if $files }} --- kind: ConfigMap apiVersion: v1 @@ -12,6 +10,5 @@ metadata: {{- toYaml . | nindent 4 }} {{- end }} data: -{{ $files.AsConfig | indent 2 }} -{{- end }} +{{ (include "aws-quota-checker.enabledDashboardsYaml" .) 
| indent 2 }} {{- end }} \ No newline at end of file diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index 82b4ec5..b7f3b68 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -46,7 +46,7 @@ spec: {{- if .Values.checker.enableDebugLogging }} - --debug {{- end }} - - prometheus-checker + - prometheus-exporter - --port - "8080" - --region diff --git a/helm/values.yaml b/helm/values.yaml index 829bd9a..35e1e13 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -35,6 +35,10 @@ checker: # Grafana dashboards dashboards: enabled: false + ignoredDashboards: + [] + # - dynamic-limits-dashboard.json + # - on-demand-ec2.json additionalLabels: grafana_dashboard: "1" # Default label for Grafana Helm chart dashboard sidecar From c6e4d22724ca94a46c6cee8697b11d25efe4b396 Mon Sep 17 00:00:00 2001 From: Fred Heinecke Date: Tue, 16 Apr 2024 13:41:59 -0500 Subject: [PATCH 4/8] Removed pod label from service monitor (already exists) --- helm/templates/servicemonitor.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/helm/templates/servicemonitor.yaml b/helm/templates/servicemonitor.yaml index 0cc6460..b6145e4 100644 --- a/helm/templates/servicemonitor.yaml +++ b/helm/templates/servicemonitor.yaml @@ -8,8 +8,6 @@ metadata: {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} spec: jobLabel: app.kubernetes.io/name - podTargetLabels: - - aws-quota-checker.gravitational.io/region selector: matchLabels: {{- include "aws-quota-checker.selectorCommonLabels" . 
| nindent 6 }} From 8bf770ce6610c3273b6d3898bf180c7105eee5e5 Mon Sep 17 00:00:00 2001 From: Fred Heinecke Date: Wed, 29 May 2024 21:36:51 -0500 Subject: [PATCH 5/8] Added alerts and fixed misc issues --- .../dynamic-limits-dashboard.json | 12 +-- helm/templates/_helpers.tpl | 56 ++++++++++---- .../templates/alert-rule/longRunningRule.yaml | 60 +++++++++++++++ helm/templates/alert-rule/quotaRule.yaml | 53 +++++++++++++ helm/templates/dashboards.yaml | 4 +- helm/templates/servicemonitor.yaml | 3 +- helm/values.yaml | 77 ++++++++++++------- 7 files changed, 215 insertions(+), 50 deletions(-) create mode 100644 helm/templates/alert-rule/longRunningRule.yaml create mode 100644 helm/templates/alert-rule/quotaRule.yaml diff --git a/grafana-dashboards/dynamic-limits-dashboard.json b/grafana-dashboards/dynamic-limits-dashboard.json index a3579c8..40d5b80 100644 --- a/grafana-dashboards/dynamic-limits-dashboard.json +++ b/grafana-dashboards/dynamic-limits-dashboard.json @@ -126,7 +126,7 @@ "disableTextWrap": false, "editorMode": "code", "exemplar": false, - "expr": "sort_desc((max by(quota, account, service_code) ({__name__=~\"awsquota_.*$\", __name__!~\"awsquota_.*_limit$\", exported_instance=\"\", scope=\"ACCOUNT\"}) / max by(quota, account, service_code) ({__name__=~\"awsquota_.*_limit$\", exported_instance=\"\", scope=\"ACCOUNT\"})) >= $MinQuotaThreashold/100)", + "expr": "sort_desc((max by(quota, account, service_code) ({__name__=~\"awsquota_.*$\", __name__!~\"awsquota_.*_limit$\", aws_resource=\"\", scope=\"ACCOUNT\"}) / max by(quota, account, service_code) ({__name__=~\"awsquota_.*_limit$\", aws_resource=\"\", scope=\"ACCOUNT\"})) >= $MinQuotaThreashold/100)", "format": "time_series", "fullMetaSearch": false, "hide": false, @@ -209,7 +209,7 @@ "disableTextWrap": false, "editorMode": "code", "exemplar": false, - "expr": "sort_desc((max by(quota, region, service_code) ({__name__=~\"awsquota_.*$\", __name__!~\"awsquota_.*_limit$\", region=~\"$region\", 
exported_instance=\"\", scope=\"REGION\"}) / max by(quota, region, service_code) ({__name__=~\"awsquota_.*_limit$\", region=~\"$region\", exported_instance=\"\", scope=\"REGION\"})) >= $MinQuotaThreashold/100)", + "expr": "sort_desc((max by(quota, region, service_code) ({__name__=~\"awsquota_.*$\", __name__!~\"awsquota_.*_limit$\", region=~\"$region\", aws_resource=\"\", scope=\"REGION\"}) / max by(quota, region, service_code) ({__name__=~\"awsquota_.*_limit$\", region=~\"$region\", aws_resource=\"\", scope=\"REGION\"})) >= $MinQuotaThreashold/100)", "format": "time_series", "fullMetaSearch": false, "hide": false, @@ -234,7 +234,7 @@ "color": { "mode": "thresholds" }, - "displayName": "[${__field.labels.exported_instance}] ${__field.labels.service_code} | ${__field.labels.quota}", + "displayName": "[${__field.labels.aws_resource}] ${__field.labels.service_code} | ${__field.labels.quota}", "fieldMinMax": false, "mappings": [], "max": 1, @@ -292,7 +292,7 @@ "disableTextWrap": false, "editorMode": "code", "exemplar": false, - "expr": "sort_desc(max by(quota, exported_instance, service_code) ({__name__=~\"awsquota_.*$\", __name__!~\"awsquota_.*_limit$\", scope=\"INSTANCE\", region=~\"$region\"}) / max by(quota, exported_instance, service_code) ({__name__=~\"awsquota_.*_limit$\", scope=\"INSTANCE\", region=~\"$region\"}) >= $MinQuotaThreashold/100)", + "expr": "sort_desc(max by(quota, aws_resource, service_code) ({__name__=~\"awsquota_.*$\", __name__!~\"awsquota_.*_limit$\", scope=\"INSTANCE\", region=~\"$region\"}) / max by(quota, aws_resource, service_code) ({__name__=~\"awsquota_.*_limit$\", scope=\"INSTANCE\", region=~\"$region\"}) >= $MinQuotaThreashold/100)", "format": "time_series", "fullMetaSearch": false, "hide": false, @@ -368,7 +368,7 @@ "$__all" ] }, - "definition": "label_values(awsquota_check_count,region)", + "definition": "label_values(awsquota_info,region)", "hide": 0, "includeAll": true, "label": "region", @@ -377,7 +377,7 @@ "options": [], "query": { 
"qryType": 1, - "query": "label_values(awsquota_check_count,region)", + "query": "label_values(awsquota_info,region)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index 1d497bb..98fd3b9 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -2,7 +2,7 @@ Expand the name of the chart. */}} {{- define "aws-quota-checker.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- default $.Chart.Name $.Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} {{/* @@ -11,14 +11,14 @@ We truncate at 63 chars because some Kubernetes name fields are limited to this If release name contains chart name it will be used as a full name. */}} {{- define "aws-quota-checker.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- if $.Values.fullnameOverride }} +{{- $.Values.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- $name := default $.Chart.Name $.Values.nameOverride }} +{{- if contains $name $.Release.Name }} +{{- $.Release.Name | trunc 63 | trimSuffix "-" }} {{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- printf "%s-%s" $.Release.Name $name | trunc 63 | trimSuffix "-" }} {{- end }} {{- end }} {{- end }} @@ -27,7 +27,7 @@ If release name contains chart name it will be used as a full name. Create chart name and version as used by the chart label. 
*/}} {{- define "aws-quota-checker.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- printf "%s-%s" $.Chart.Name $.Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- end }} {{/* @@ -36,10 +36,10 @@ Common labels {{- define "aws-quota-checker.commonLabels" -}} helm.sh/chart: {{ include "aws-quota-checker.chart" . }} {{ include "aws-quota-checker.selectorCommonLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- if $.Chart.AppVersion }} +app.kubernetes.io/version: {{ $.Chart.AppVersion | quote }} {{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/managed-by: {{ $.Release.Service }} {{- end }} {{/* @@ -47,7 +47,7 @@ Selector labels */}} {{- define "aws-quota-checker.selectorCommonLabels" -}} app.kubernetes.io/name: {{ include "aws-quota-checker.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/instance: {{ $.Release.Name }} {{- end }} {{/* @@ -57,9 +57,39 @@ Dashboards to deploy {{- $filteredFiles := dict }} {{- range $fileName, $fileContent := .Files.Glob "grafana-dashboards/*.json" }} {{- $baseFileName := base $fileName }} -{{- if not (has $baseFileName $.Values.dashboards.ignoredDashboards) }} +{{- if not (has $baseFileName $.Values.visualization.dashboards.ignoredDashboards) }} {{- $_ := set $filteredFiles $baseFileName ($fileContent | toString) }} {{- end }} {{- end }} {{- $filteredFiles | toYaml }} +{{- end }} + +{{/* +Rule template for request duration +*/}} +{{- define "aws-quota-checker.requestDurationRule" -}} +{{- $alertValues := .AlertValues -}} +{{- $querySuffix := .QuerySuffix -}} +{{- $verbPhrase := .VerbPhrase -}} +{{- $alertName := .AlertName -}} +{{ $ := .Context }} +{{- if $alertValues.enabled }} + - alert: AWSQuota {{- $alertName }} + expr: >- + avg_over_time( + avg by (quota, account, region) ( + { + __name__=~"awsquota_.*_{{ $querySuffix }}", + job=~"{{ 
printf "%s-.*" (include "aws-quota-checker.fullname" $) }}", + namespace="{{ $.Release.Namespace }}" + } + )[{{ $alertValues.duration }}:] + ) > {{ $alertValues.thresholdSeconds }} + for: {{ $alertValues.duration }} + annotations: + description: >- + {{ $verbPhrase }} has averaged more than {{ $alertValues.thresholdSeconds }} seconds + over the past {{ $alertValues.duration }}. + summary: {{ $verbPhrase }} is taking too long. +{{- end }} {{- end }} \ No newline at end of file diff --git a/helm/templates/alert-rule/longRunningRule.yaml b/helm/templates/alert-rule/longRunningRule.yaml new file mode 100644 index 0000000..da51f1d --- /dev/null +++ b/helm/templates/alert-rule/longRunningRule.yaml @@ -0,0 +1,60 @@ +{{- $enableQuotaRule := false -}} +{{- range $rule := values .Values.alerting.prometheusRules.requestDuration -}} +{{- $enableQuotaRule = or $enableQuotaRule $rule.enabled -}} +{{- end -}} +{{- if $enableQuotaRule -}} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "aws-quota-checker.fullname" . }}.request-rules + labels: + {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} + {{- with .Values.visualization.dashboards.additionalLabels }} + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + groups: + - name: aws_quota_checker.rules.queries + rules: +{{- + include "aws-quota-checker.requestDurationRule" ( + dict + "AlertValues" .Values.alerting.prometheusRules.requestDuration.longRunningLimitQuery + "QuerySuffix" "limit_duration_seconds" + "VerbPhrase" "Querying for one or more quota limits" + "AlertName" "LongRunningQuotaLimitQuery" + "Context" $ + ) +}} +{{- + include "aws-quota-checker.requestDurationRule" ( + dict + "AlertValues" .Values.alerting.prometheusRules.requestDuration.longRunningCountQuery + "QuerySuffix" "count_duration_seconds" + "VerbPhrase" "Querying for one or more quota counts" + "AlertName" "LongRunningQuotaCountQuery" + "Context" $ + ) +}} +{{- + include "aws-quota-checker.requestDurationRule" ( + dict + "AlertValues" .Values.alerting.prometheusRules.requestDuration.longRunningLimitQueryTotal + "QuerySuffix" "checks_duration_seconds" + "VerbPhrase" "Querying for all quota limits" + "AlertName" "LongRunningQuotaLimitQueryTotal" + "Context" $ + ) +}} +{{- + include "aws-quota-checker.requestDurationRule" ( + dict + "AlertValues" .Values.alerting.prometheusRules.requestDuration.longRunningCountQueryTotal + "QuerySuffix" "currents_duration_seconds" + "VerbPhrase" "Querying for all quota counts" + "AlertName" "LongRunningQuotaCountQueryTotal" + "Context" $ + ) +}} +{{- end }} \ No newline at end of file diff --git a/helm/templates/alert-rule/quotaRule.yaml b/helm/templates/alert-rule/quotaRule.yaml new file mode 100644 index 0000000..2052c5b --- /dev/null +++ b/helm/templates/alert-rule/quotaRule.yaml @@ -0,0 +1,53 @@ +{{- if gt (len .Values.alerting.prometheusRules.quotas) 0}} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "aws-quota-checker.fullname" . }}.quota-rules + labels: + {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} + {{- with .Values.visualization.dashboards.additionalLabels }} + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + groups: + - name: aws_quota_checker.rules.quotas + rules: +{{- range .Values.alerting.prometheusRules.quotas }} +{{- $quotaName := .quotaName }} +{{- $alertName := .alertName }} +{{- $countQuery := printf "__name__=~\"awsquota_%s\"" $quotaName }} +{{- $limitQuery := printf "__name__=~\"awsquota_%s_limit\"" $quotaName }} +{{- if eq $quotaName "all" }} +{{- $countQuery = "__name__=~\"awsquota_.*\",\n__name__!~\"awsquota_(check_count|info|.*_duration_seconds|.*_limit)\""}} +{{- $limitQuery = "__name__=~\"awsquota_.*_limit\"" }} +{{- end }} + - alert: AWSQuota {{- $alertName }} + expr: >- + sum by (account, region, aws_resource, quota, scope) ( + { + {{- $countQuery | nindent 18 }}, + job=~"{{ printf "%s-.*" (include "aws-quota-checker.fullname" $) }}", + namespace="{{ $.Release.Namespace }}" + } + ) + / + sum by (account, region, aws_resource, quota, scope) ( + { + {{- $limitQuery | nindent 18 }}, + job=~"{{ printf "%s-.*" (include "aws-quota-checker.fullname" $) }}", + namespace="{{ $.Release.Namespace }}" + } + ) + > {{ .threshold }} + {{- if .duration }} + for: {{ .duration }} + {{- end }} + annotations: + description: >- + Quota threshold of {{ mulf .threshold 100 }}% for {{ "{{" }} $labels.quota {{ "}}" }} + {{ "{{" }} if $labels.aws_resource {{ "}}" }} on resource {{ "{{" }} $labels.aws_resource {{ "}}" }}{{ "{{" }} end {{ "}}" }} + in {{ "{{" }} $labels.account {{ "}}" }}/{{ "{{" }}$labels.region{{ "}}" }} has been reached. 
+ summary: Reached quota threshold for {{ "{{" }} $labels.quota {{ "}}" }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/dashboards.yaml b/helm/templates/dashboards.yaml index bfd448e..910b45f 100644 --- a/helm/templates/dashboards.yaml +++ b/helm/templates/dashboards.yaml @@ -1,4 +1,4 @@ -{{- if .Values.dashboards.enabled }} +{{- if .Values.visualization.dashboards.enabled }} --- kind: ConfigMap apiVersion: v1 @@ -6,7 +6,7 @@ metadata: name: {{ include "aws-quota-checker.fullname" . }}-grafana-dashboards labels: {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} - {{- with .Values.dashboards.additionalLabels }} + {{- with .Values.visualization.dashboards.additionalLabels }} {{- toYaml . | nindent 4 }} {{- end }} data: diff --git a/helm/templates/servicemonitor.yaml b/helm/templates/servicemonitor.yaml index b6145e4..4be6ca0 100644 --- a/helm/templates/servicemonitor.yaml +++ b/helm/templates/servicemonitor.yaml @@ -1,4 +1,4 @@ -{{- if .Values.serviceMonitor.enabled }} +{{- if .Values.monitoring.serviceMonitor.enabled }} --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor @@ -7,7 +7,6 @@ metadata: labels: {{- include "aws-quota-checker.commonLabels" . | nindent 4 }} spec: - jobLabel: app.kubernetes.io/name selector: matchLabels: {{- include "aws-quota-checker.selectorCommonLabels" . | nindent 6 }} diff --git a/helm/values.yaml b/helm/values.yaml index 35e1e13..7b3bba3 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -1,7 +1,5 @@ --- # Default values for aws-quota-checker. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. 
image: repository: ghcr.io/gravitational/aws-quota-checker @@ -32,39 +30,68 @@ checker: # metricsPrefix: "" enableDurationMetrics: true -# Grafana dashboards -dashboards: - enabled: false - ignoredDashboards: - [] - # - dynamic-limits-dashboard.json - # - on-demand-ec2.json - additionalLabels: - grafana_dashboard: "1" # Default label for Grafana Helm chart dashboard sidecar +monitoring: + serviceMonitor: + enabled: true + +alerting: + prometheusRules: + requestDuration: + longRunningLimitQuery: + enabled: true + thresholdSeconds: 0.5 + duration: 5m + longRunningCountQuery: + enabled: true + thresholdSeconds: 1 + duration: 5m + longRunningLimitQueryTotal: + enabled: true + thresholdSeconds: 0.5 + duration: 5m + longRunningCountQueryTotal: + enabled: true + thresholdSeconds: 1 + duration: 5m + quotas: + - alertName: General + quotaName: all + duration: 5m + threshold: 0.9 + # - alertName: ECRImagesPerRepository + # quotaName: ecr_images_per_repository + # duration: 5m + # threshold: 0.05 + +visualization: + # Grafana dashboards + dashboards: + enabled: false + ignoredDashboards: + [] + # - dynamic-limits-dashboard.json + # - on-demand-ec2.json + additionalLabels: + grafana_dashboard: "1" # Default label for Grafana Helm chart dashboard sidecar podAnnotations: {} podLabels: {} podSecurityContext: - {} - # fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 securityContext: - {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true service: type: ClusterIP port: 10014 # Next available port from https://github.com/prometheus/prometheus/wiki/Default-port-allocations -serviceMonitor: - enabled: false - serviceAccount: enabled: true annotations: @@ -72,10 +99,6 @@ serviceAccount: resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. 
This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. # limits: # cpu: 100m # memory: 128Mi From 08a2b2800f083ca4216843734428e12e26421e18 Mon Sep 17 00:00:00 2001 From: Fred Heinecke Date: Wed, 29 May 2024 23:38:20 -0500 Subject: [PATCH 6/8] Set more reasonable values for query alert thresholds/durations --- helm/values.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/helm/values.yaml b/helm/values.yaml index 7b3bba3..080624c 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -40,19 +40,19 @@ alerting: longRunningLimitQuery: enabled: true thresholdSeconds: 0.5 - duration: 5m + duration: 15m longRunningCountQuery: enabled: true - thresholdSeconds: 1 - duration: 5m + thresholdSeconds: 5 + duration: 15m longRunningLimitQueryTotal: enabled: true - thresholdSeconds: 0.5 - duration: 5m + thresholdSeconds: 300 + duration: 30m longRunningCountQueryTotal: enabled: true - thresholdSeconds: 1 - duration: 5m + thresholdSeconds: 180 + duration: 30m quotas: - alertName: General quotaName: all From 07c34258b758b0f0dfef043d2821357d651a7599 Mon Sep 17 00:00:00 2001 From: Fred Heinecke Date: Wed, 29 May 2024 23:43:30 -0500 Subject: [PATCH 7/8] Disable prometheus operator custom resources by default for better out of box compatibility --- helm/values.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/helm/values.yaml b/helm/values.yaml index 080624c..b3cef3c 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -32,32 +32,33 @@ checker: monitoring: serviceMonitor: - enabled: true + enabled: false alerting: prometheusRules: requestDuration: longRunningLimitQuery: - enabled: true + enabled: false thresholdSeconds: 0.5 duration: 15m longRunningCountQuery: - enabled: true + enabled: false thresholdSeconds: 5 
duration: 15m longRunningLimitQueryTotal: - enabled: true + enabled: false thresholdSeconds: 300 duration: 30m longRunningCountQueryTotal: - enabled: true + enabled: false thresholdSeconds: 180 duration: 30m quotas: - - alertName: General - quotaName: all - duration: 5m - threshold: 0.9 + [] + # - alertName: General + # quotaName: all # Special case to monitor all enabled metrics + # duration: 5m + # threshold: 0.9 # - alertName: ECRImagesPerRepository # quotaName: ecr_images_per_repository # duration: 5m From 6d242d38630036a88469b32559a430e0383c5be5 Mon Sep 17 00:00:00 2001 From: Fred Heinecke Date: Thu, 30 May 2024 00:55:51 -0500 Subject: [PATCH 8/8] Added ability to set additional alert labels --- helm/templates/_helpers.tpl | 4 ++++ helm/templates/alert-rule/quotaRule.yaml | 4 ++++ helm/values.yaml | 2 ++ 3 files changed, 10 insertions(+) diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index 98fd3b9..2bffb31 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -91,5 +91,9 @@ Rule template for request duration {{ $verbPhrase }} has averaged more than {{ $alertValues.thresholdSeconds }} seconds over the past {{ $alertValues.duration }}. summary: {{ $verbPhrase }} is taking too long. + {{- if $.Values.alerting.prometheusRules.additionalLabels }} + labels: + {{- $.Values.alerting.prometheusRules.additionalLabels | toYaml | trim | nindent 12 }} + {{- end }} {{- end }} {{- end }} \ No newline at end of file diff --git a/helm/templates/alert-rule/quotaRule.yaml b/helm/templates/alert-rule/quotaRule.yaml index 2052c5b..082d4ee 100644 --- a/helm/templates/alert-rule/quotaRule.yaml +++ b/helm/templates/alert-rule/quotaRule.yaml @@ -49,5 +49,9 @@ spec: {{ "{{" }} if $labels.aws_resource {{ "}}" }} on resource {{ "{{" }} $labels.aws_resource {{ "}}" }}{{ "{{" }} end {{ "}}" }} in {{ "{{" }} $labels.account {{ "}}" }}/{{ "{{" }}$labels.region{{ "}}" }} has been reached.
summary: Reached quota threshold for {{ "{{" }} $labels.quota {{ "}}" }} + {{- if $.Values.alerting.prometheusRules.additionalLabels }} + labels: + {{- $.Values.alerting.prometheusRules.additionalLabels | toYaml | trim | nindent 12 }} + {{- end }} {{- end }} {{- end }} \ No newline at end of file diff --git a/helm/values.yaml b/helm/values.yaml index b3cef3c..4e54382 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -63,6 +63,8 @@ alerting: # quotaName: ecr_images_per_repository # duration: 5m # threshold: 0.05 + additionalLabels: {} # Extra labels added to the generated Prometheus alert rules + # labelName: labelValue visualization: # Grafana dashboards