diff --git a/charts/opencost-config/dashboards/OpenCost-multiday-summary.json b/charts/opencost-config/dashboards/OpenCost-multiday-summary.json deleted file mode 100644 index 1bb2027..0000000 --- a/charts/opencost-config/dashboards/OpenCost-multiday-summary.json +++ /dev/null @@ -1,742 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "description": "Based on the average total hourly node cost", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 2, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "semi-dark-blue", - "value": null - } - ] - }, - "unit": "currencyUSD" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 0 - }, - "id": 2, - "options": { - "colorMode": "background_solid", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "value", - "wideLayout": true - }, - "pluginVersion": "11.1.4", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "editorMode": "code", - "expr": "sum(avg by (exported_instance) (node_total_hourly_cost)) * 24 * ${time_period_days}", - "instant": false, - "interval": "", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "timeFrom": "${time_period_days}d", - "title": "Estimated cluster cost", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - 
"uid": "PFB5ABA51A8A585D7" - }, - "description": "Based on total node price", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "semi-dark-blue", - "value": null - } - ] - }, - "unit": "currencyUSD" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 16, - "x": 8, - "y": 0 - }, - "id": 3, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": false - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "editorMode": "code", - "expr": "sum(node_total_hourly_cost)", - "instant": false, - "legendFormat": "Cluster Hour Cost", - "range": true, - "refId": "A" - } - ], - "timeFrom": "${time_period_days}d", - "title": "Cluster Hour Cost", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "decimals": 2, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#73BF69", - "value": null - } - ] - }, - "unit": 
"currencyUSD" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "custom.cellOptions", - "value": { - "mode": "gradient", - "type": "gauge", - "valueDisplayMode": "text" - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 6 - }, - "id": 4, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "Cost" - } - ] - }, - "pluginVersion": "11.1.4", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "editorMode": "code", - "exemplar": false, - "expr": "topk(20,\n sum(\n sum(container_memory_allocation_bytes) \n by (exported_namespace,exported_instance)\n * on(exported_instance) group_left() (\n\t\t\tnode_ram_hourly_cost{} / 1024 / 1024 / 1024 * (24 * ${time_period_days})\n\t\t)\n\n +\n\n sum(container_cpu_allocation) \n by (exported_namespace,exported_instance)\n * on(exported_instance) group_left() (\n\t\t\tnode_cpu_hourly_cost{} * (24 * ${time_period_days})\n\t\t)\n ) by (exported_namespace)\n)", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "top_namespaces" - } - ], - "timeFrom": "${time_period_days}d", - "title": "Top 20 namespaces by estimated cost", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "includeByName": {}, - "indexByName": {}, - "renameByName": { - "Value": "Cost", - "exported_namespace": "Namespace", - "namespace": "Namespace" - } - } - } - ], - "type": "table" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - 
"decimals": 2, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#73BF69", - "value": null - } - ] - }, - "unit": "currencyUSD" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "custom.cellOptions", - "value": { - "mode": "gradient", - "type": "gauge", - "valueDisplayMode": "text" - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 6 - }, - "id": 5, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "Cost" - } - ] - }, - "pluginVersion": "11.1.4", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "editorMode": "code", - "exemplar": false, - "expr": "topk(20,sum by (exported_namespace,owner_name)(((sum(container_memory_allocation_bytes) \n\tby (instance,exported_instance,exported_namespace,pod)\n\t+ on (instance,exported_namespace,pod) group_left(owner_name) kube_pod_owner{} * 0)\n\t* on(exported_instance) group_left() (\n\t\tnode_ram_hourly_cost{instance_type != \"\", job =\"opencost-exporter\"} / 1024 / 1024 / 1024 * (24 * ${time_period_days})\n \t))\n\t+\n\t((sum(container_cpu_allocation) \n\tby (instance,exported_instance,exported_namespace,pod)\n\t+ on (instance,exported_namespace,pod) group_left(owner_name) kube_pod_owner{} * 0)\n\t* on(exported_instance) group_left() (\n\t\tnode_cpu_hourly_cost{instance_type != \"\", job =\"opencost-exporter\"} * (24 * ${time_period_days})\n \t))\n\t))", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "top_namespaces" - } - ], - "timeFrom": "${time_period_days}d", - "title": "Top 20 workloads by estimated cost", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "includeByName": 
{}, - "indexByName": { - "Time": 0, - "Value": 3, - "exported_namespace": 2, - "owner_name": 1 - }, - "renameByName": { - "Value": "Cost", - "exported_namespace": "Namespace", - "namespace": "Namespace", - "owner_name": "Workload" - } - } - } - ], - "type": "table" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "decimals": 2, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#73BF69", - "value": null - } - ] - }, - "unit": "currencyUSD" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "custom.cellOptions", - "value": { - "mode": "gradient", - "type": "gauge", - "valueDisplayMode": "text" - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 6 - }, - "id": 6, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [] - }, - "pluginVersion": "11.1.4", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "editorMode": "code", - "exemplar": false, - "expr": "topk(20,\n\t\tsum(\n\t\t sum(container_memory_allocation_bytes) by (instance,exported_namespace,exported_instance, container)\n\t\t * on(exported_instance) group_left() (\n\t\t\t\t node_ram_hourly_cost{instance_type != \"\", job =\"opencost-exporter\", orig_namespace = \"\"} / 1024 / 1024 / 1024 * (24 * ${time_period_days})\n\t\t\t )\n\t\t +\n\t\t sum(container_cpu_allocation) by (instance,exported_namespace,exported_instance, container)\n\t\t * on(exported_instance) group_left() (\n\t\t\t\t node_cpu_hourly_cost{instance_type != \"\", job =\"opencost-exporter\", orig_namespace = \"\"} * (24 * ${time_period_days})\n\t\t\t 
)\n\t\t) by (exported_namespace, container)\n\t )", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "top_namespaces" - } - ], - "timeFrom": "${time_period_days}d", - "title": "Top 20 containers by estimated cost", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "includeByName": {}, - "indexByName": { - "Time": 0, - "Value": 3, - "container": 1, - "exported_namespace": 2 - }, - "renameByName": { - "Value": "Cost", - "container": "Container", - "exported_namespace": "Namespace", - "namespace": "Namespace" - } - } - } - ], - "type": "table" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "description": "NOTE: only namespaces with < 80% efficiency shown", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 0, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "#EAB839", - "value": 25 - }, - { - "color": "green", - "value": 65 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 14, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 7, - "options": { - "displayMode": "lcd", - "maxVizHeight": 300, - "minVizHeight": 16, - "minVizWidth": 8, - "namePlacement": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "sizing": "auto", - "valueMode": "color" - }, - "pluginVersion": "11.1.4", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "editorMode": "code", - "expr": "(sum by (namespace_name)(avg_over_time(kubernetes_io:container_memory_used_bytes[5m])) / sum by (namespace_name)(avg_over_time(kubernetes_io:container_memory_request_bytes[5m])) * 100) < 79", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "timeFrom": 
"${time_period_days}d", - "title": "Memory efficiency (%) by Namespace", - "type": "bargauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "description": "NOTE: only namespaces with < 80% efficiency shown", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 0, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "yellow", - "value": 25 - }, - { - "color": "green", - "value": 65 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 14, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 8, - "options": { - "displayMode": "lcd", - "maxVizHeight": 300, - "minVizHeight": 16, - "minVizWidth": 8, - "namePlacement": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "sizing": "auto", - "valueMode": "color" - }, - "pluginVersion": "11.1.4", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PFB5ABA51A8A585D7" - }, - "editorMode": "code", - "expr": "((sum by (namespace_name)(rate(kubernetes_io:container_cpu_core_usage_time{monitored_resource=\"k8s_container\"}[${__interval}])) / sum by (namespace_name)(avg_over_time(kubernetes_io:container_cpu_request_cores{monitored_resource=\"k8s_container\"}[${__interval}]))) * 100) < 80", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "timeFrom": "${time_period_days}d", - "title": "CPU efficiency (%) by Namespace", - "type": "bargauge" - } - ], - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": true, - "text": "1", - "value": "1" - }, - "hide": 0, - "includeAll": false, - "label": "Time period (days)", - "multi": false, - "name": "time_period_days", - "options": [ - { - "selected": true, - "text": "1", - "value": "1" - }, - { - "selected": false, - "text": "7", - 
"value": "7" - }, - { - "selected": false, - "text": "30", - "value": "30" - } - ], - "query": "1,7,30", - "queryValue": "", - "skipUrlSync": false, - "type": "custom" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "hidden": true - }, - "timezone": "browser", - "title": "Multi-day summary", - "uid": "adxky3sjvubcwc", - "version": 13, - "weekStart": "" -} \ No newline at end of file diff --git a/content/architecture.md b/content/architecture.md index cec47b1..1334060 100644 --- a/content/architecture.md +++ b/content/architecture.md @@ -10,7 +10,4 @@ FinOps Stack makes use of a wide range of software to provide a highly efficient - **gmp-proxy**: A proxy designed to simplify the integration between open-source Grafana and Google Managed Prometheus. - **[Cert-Manager](https://cert-manager.io/)**: An open source tool that provides certificate management for Kubernetes. In the FinOps Stack, cert-manager can be optionally installed to create a tls certificate if using ingress for Grafana. -## What does this look like: -This Diagram describes what the FinOps Stack looks like within your cluster: -![Architecture Diagram of FinOpsStack](assets/architecture.png) diff --git a/content/dashboards.md b/content/dashboards.md new file mode 100644 index 0000000..0f766c7 --- /dev/null +++ b/content/dashboards.md @@ -0,0 +1,59 @@ +# Grafana dashboards & Prometheus + +Depending on how you are running Prometheus and Grafana, you might need to tweak the PromQL used in the various dashboards provided in [our chart](https://github.com/jetstack/finops-stack/tree/main/charts/opencost-config/dashboards). This page contains notes and troubleshooting tips based on our experiences. + +## Prometheus + +### Duplicated labels + +The Opencost metrics use the `instance` and `namespace` labels to identify the _pod and namespace to which the metric relates_, i.e. _not_ the Opencost Prometheus Exporter pod.
Depending on how your Prometheus service is configured or if you are using Google Managed Prometheus (GMP) the values of these labels could be overwritten, namely: + +- GMP replaces the `instance` and `namespace` values with the name and namespace of the pod the metrics were scraped from. In our case this is always the Exporter pod in the `finops-stack` namespace. + +- In standalone Prometheus the same behaviour as above will occur unless config value `honor_labels` has been set to true. + +In both cases the original values of the namespace and instance labels will be copied to `exported_namespace` and `exported_instance` respectively. + +__NOTE:__ Our dashboards use the `exported_*` labels as GMP does not support the `honor_labels` configuration. + +### Query execution time + +Several of the PromQL queries use group_left 'joins'; be aware these can get expensive and time consuming if you select very broad time ranges. + +### Accessing Google Managed Prometheus APIs + +**Only applicable if you are using GKE + Google Managed Prometheus** + +GMP provides endpoints for almost all of Prometheus' API (as documented [here](https://cloud.google.com/stackdriver/docs/managed-prometheus/query-api-ui#http-api-details)); like all of GCP's APIs they require a valid Bearer token in the Authorization header. Thus, the Grafana GMP data source needs a valid token. Google do provide 2 suggestions for automating this process ([here](https://cloud.google.com/stackdriver/docs/managed-prometheus/query-api-ui)). Neither solution is simple and both have downsides, so we provide an alternative via the [`gmp-proxy` chart](https://github.com/jetstack/finops-stack/tree/main/charts/gmp-proxy) in the `finops-stack` repo. This chart installs an Envoy Proxy workload which is configured to add a valid Bearer token to each GMP API request before forwarding it to the GMP endpoint. 
+ +#### Usage + +##### Pre-requisites + +The GMP Proxy Pod uses GCP Workload Identity so you'll need to associate the SA used by the Pod with a GCP SA that has the following roles: + +- monitoring.viewer +- iam.serviceAccountTokenCreator + +##### Envoy Proxy image + +The standard Envoy Docker image can't be used as is because an additional Lua library is required, so we have provided a custom image which includes this library. If you prefer to use your own image, this is the `Dockerfile` we used: + +``` +# Image from https://hub.docker.com/r/envoyproxy/envoy +FROM envoyproxy/envoy:v1.31-latest +RUN apt update && apt install -y luarocks +RUN luarocks install lua-cjson +``` + +## Dashboards + +Currently the working and tested dashboards are: + +### FinOps Stack: Cluster cost & efficiency + +The purpose of this dashboard is to provide an overview of the cost of your cluster and namespaces using Opencost's calculations. The time range control is disabled and you can only select from 3 time periods using a drop down in the top left. This is because some of the queries are expensive and with too long a time period cause the query request to timeout. + +### FinOps Stack: Used | Wasted Resources for cluster & namespace + +The purpose of this dashboard is to provide details of resource usage and wastage at the cluster, namespace and pod level. You can select different time periods using the standard Grafana time range control in the top right. diff --git a/content/install.md b/content/install.md index a13659d..63969c3 100644 --- a/content/install.md +++ b/content/install.md @@ -6,7 +6,7 @@ title: Quickstart To simplify installation, the FinOps Stack is installed using a single Helmfile command. -The following instructions are designed to work with GKE standard and GKE autopilot. For full instructions, prerequisites and customisations, please see the installation README. +The following instructions are designed to work with GKE standard and GKE autopilot. 
For full instructions, prerequisites and customisations, please see the [installation README](https://github.com/jetstack/finops-stack/blob/main/installation/README.md). ### Helmfile diff --git a/content/intro.md b/content/intro.md index d5f8241..9606252 100644 --- a/content/intro.md +++ b/content/intro.md @@ -27,6 +27,6 @@ More detail on each policy can be found in the dedicated Policies page. Overall cluster efficiency, and especially right-sizing container requests and limits, is essential to running an optimised Kubernetes cluster. -Goldilocks is included in the FinOps Stack to provide guidance on right-sized container requests and limits, and to complement the visualisations in the Stack's Grafana dashboard. +Goldilocks is included in the FinOps Stack to provide guidance on right-sized container requests and limits, and to complement the visualisations in the Stack's Grafana dashboard. -In addition, see the distribution-gke-autopilot page of this website to see a detailed breakdown of considerations for achieving greater cluster efficiency in GKE Autopilot. \ No newline at end of file +In addition, see the [GKE Autopilot page](./distribution-gke-autopilot.md) to see a detailed breakdown of considerations for achieving greater cluster efficiency in GKE Autopilot. diff --git a/content/policies.md b/content/policies.md index 3eb4016..d5ac1f3 100644 --- a/content/policies.md +++ b/content/policies.md @@ -9,10 +9,9 @@ Additionally, policy enforcement helps maintain a balance between agility and go ## Policy Examples -Within the FinOps Stack, we've provided some policies we believe to be important when considering implementing policy enforcement and FinOps practices. Each policy and its function is described below. +Within the FinOps Stack, we've provided some policies we believe to be important when considering implementing policy enforcement and FinOps practices. Each policy and its function is described below. 
By default all policies are set to 'Audit' rather than 'Enforce'; this can be changed for individual policies or all of them via Helm values. All policies can be found in `./charts/finops-policies/templates`. - -### Block Large images +### Block Large images ([`block-large-image.yaml`](https://github.com/jetstack/finops-stack/blob/main/charts/finops-policies/templates/block-large-image.yaml)) Containers with excessively large image sizes can significantly impact the performance and efficiency of Kubernetes clusters. Large images take longer to pull from the registry, increasing deployment times and potentially causing delays in application start-up, which can disrupt critical operations. Additionally, these images consume more disk space on the nodes, which can lead to node resource exhaustion and reduce the available capacity for other workloads. Large images may also create unnecessary network traffic, particularly when pulled across multiple nodes, impacting overall cluster performance and increasing operational costs. @@ -20,7 +19,7 @@ This policy is designed to mitigate such risks by enforcing a strict limit on co By enforcing this policy, organisations can maintain a standardised, efficient, and secure environment, ensuring that all workloads deployed are optimised for performance and resource usage. This policy also promotes best practices in image optimisation, encouraging development teams to reduce bloat, use multi-stage builds, and implement efficient base images to keep container sizes within acceptable limits. -### Validate Cost Center Label +### Validate Cost Center Label ([`validate-cost-center-label.yaml`](https://github.com/jetstack/finops-stack/blob/main/charts/finops-policies/templates/validate-cost-center-label.yaml)) Labels in Kubernetes are key/value pairs used to organise and manage workloads by adding metadata that is meaningful to users but does not affect the core functionality of the system. 
They play a crucial role in filtering, grouping, and selecting objects, such as pods, for operational tasks like deployments, scaling, monitoring, and cost tracking. Proper labelling ensures clarity and consistency across teams, enabling effective resource allocation, automation, and management within the cluster. @@ -28,7 +27,7 @@ This policy enforces the requirement that all Pods deployed within the cluster m By enforcing this policy, organisations ensure that all workloads are tagged with relevant financial metadata, which can be used to generate detailed cost reports and dashboards, monitor spending per team or project, and prevent unaccounted-for resource usage. It also promotes accountability and transparency across teams, ensuring that cloud resources are aligned with business objectives. Failure to apply this label may result in unidentified costs, making it difficult to optimise budgets or identify areas where cost-saving measures are needed. -### Restrict Scale of Deployments/Statefulsets, etc +### Restrict Scale of Deployments/Statefulsets, etc ([`restrict_scale.yaml`](https://github.com/jetstack/finops-stack/blob/main/charts/finops-policies/templates/restrict_scale.yaml)) Pod controllers, such as Deployments, manage replicas of pods to ensure high availability, scalability, and resiliency of applications running in Kubernetes. These controllers use the /scale subresource to dynamically adjust the number of pod replicas, either manually or through auto-scaling mechanisms. While scaling is critical to meeting demand, uncontrolled or excessive scaling can exhaust cluster resources, destabilise workloads, or lead to unexpected costs. @@ -38,7 +37,7 @@ The policy also accounts for operational governance by controlling the scale act By enforcing this policy, organisations can optimise cluster stability, prevent resource exhaustion, and maintain cost-effective scaling practices, ensuring that both system performance and financial controls are upheld. 
-### Disallow Service LoadBalancers +### Disallow Service LoadBalancers ([`disallow_service_type_loadBalancer.yaml`](https://github.com/jetstack/finops-stack/blob/main/charts/finops-policies/templates/disallow_service_type_loadBalancer.yaml)) In Kubernetes, the LoadBalancer service type provides a mechanism to expose services externally to the internet by automatically provisioning an external load balancer. While this is a convenient method to provide access to applications, improper or excessive use of LoadBalancer services can lead to unintended consequences, such as increased cloud costs, security risks, and resource consumption. @@ -46,7 +45,7 @@ This policy enforces governance around the creation and usage of services with t By applying this policy, organisations can maintain tighter control over network traffic management, ensuring that only approved applications can utilise external load balancers, and promoting the use of alternative, cost-effective service types such as ClusterIP or NodePort where external access is not required. The policy also supports FinOps principles by enforcing the responsible use of cloud resources and minimising operational costs related to networking services. -### Add HPA to Deployments/StatefulSets +### Add HPA to Deployments/StatefulSets ([`autoscaler_policies/*.yaml`](https://github.com/jetstack/finops-stack/tree/main/charts/finops-policies/templates/autoscaler_policies)) Horizontal Pod Autoscalers (HPA) are critical components in Kubernetes for maintaining application performance and resource efficiency by automatically adjusting the number of pod replicas in response to varying workloads. A Kyverno policy can automate the generation of HPA resources for each deployment, ensuring that applications are able to handle sudden spikes in traffic without downtime, while also scaling down during periods of low demand to conserve resources.
@@ -56,7 +55,7 @@ Additionally, the policy supports cost-efficiency by ensuring that resources are This Kyverno policy plays a crucial role in balancing application reliability, performance, and cost management, especially in dynamic cloud environments where traffic and resource requirements can fluctuate unpredictably. -### Prevent Naked Pods +### Prevent Naked Pods ([`prevent_orphan_pods.yaml`](https://github.com/jetstack/finops-stack/blob/main/charts/finops-policies/templates/prevent_orphan_pods.yaml)) Pods that are not managed by higher-level workload controllers, such as Deployments, StatefulSets, or DaemonSets, lack the self-healing and scaling capabilities that Kubernetes offers. These “naked” Pods, when deployed directly, are unsuitable for production environments as they do not benefit from automated recovery if a node fails or a pod crashes, nor can they scale up or down based on demand. @@ -65,3 +64,17 @@ This policy enforces a restriction that prevents the creation of such standalone In addition to improving operational stability, the policy promotes best practices in Kubernetes resource management by encouraging developers and operators to use controllers like Deployments for managing pod lifecycles. Workload controllers provide declarative updates, rolling deployments, and the ability to maintain desired state configurations, which are essential features for maintaining robust and scalable applications in modern cloud-native environments. By enforcing this policy, organisations can prevent the accidental deployment of fragile, unmanaged Pods, reducing operational risks and ensuring a more resilient production environment. 
+ +### Various quota policies ([`quota_management_policies/*.yaml`](https://github.com/jetstack/finops-stack/tree/main/charts/finops-policies/templates/quota_management_policies)) + +#### Add namespace quota + +To better control the number of resources that can be created in a given Namespace and provide default resource consumption limits for Pods, ResourceQuota and LimitRange resources are recommended. This policy will generate ResourceQuota and LimitRange resources when a new Namespace is created. + +#### Namespace Inventory Check + +In cases such as multi-tenancy where new Namespaces must be fully provisioned before they can be used, it may not be easy to declare and understand if/when the Namespace is ready. Having a policy which defines all the resources which are required for each Namespace can assist in determining compliance. This policy, expected to be run in background mode only, performs a Namespace inventory check to ensure that all Namespaces have a ResourceQuota and NetworkPolicy. Additional rules may be written to extend the check for your needs. By default, background scans occur every hour, which may be changed with an additional container flag. Please see the installation documentation for details. + +#### Require Limits and Requests + +As application workloads share cluster resources, it is important to limit resources requested and consumed by each Pod. It is recommended to require resource requests and limits per Pod, especially for memory and CPU. If a Namespace level request or limit is specified, defaults will automatically be applied to each Pod based on the LimitRange configuration. This policy validates that all containers have something specified for memory and CPU requests and memory limits.
diff --git a/installation/env.tmpl b/installation/env.tmpl index 8525b0f..90719c7 100644 --- a/installation/env.tmpl +++ b/installation/env.tmpl @@ -19,6 +19,6 @@ GRAFANA_INGRESS="false" # GRAFANA_PUBLIC_IP_NAME="name-of-public-ip" # GRAFANA_FQDN="grafana.host.name" -## GCP SA for workload identity for cert-manager (only required if using ingress) -# CERT_MANAGER_SA_ANNOTATION="iam.gke.io/gcp-service-account: cert-manager-sa@my-gcp-project.iam.gserviceaccount.com" -# CERT_MANAGER_EMAIL="issuer@example.com" +## GCP SA for workload identity for cert-manager (these must be defined, but are only used if cert-manager is being installed) +CERT_MANAGER_SA_ANNOTATION="iam.gke.io/gcp-service-account: cert-manager-sa@my-gcp-project.iam.gserviceaccount.com" +CERT_MANAGER_EMAIL="issuer@example.com" diff --git a/mkdocs.yaml b/mkdocs.yaml index c06ce07..78c3831 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -17,13 +17,15 @@ extra: nav: - Home: index.md - - Introduction: + - About: - Overview: intro.md - Architecture: architecture.md - Policies: policies.md - - Working with GKE Autopilot: distribution-gke-autopilot.md + - Dashboards: dashboards.md - Installation: - install.md + - Distribution notes: + - distribution-gke-autopilot.md - credit.md extra_css: @@ -79,6 +81,7 @@ plugins: - git-revision-date-localized: type: date enable_creation_date: true + fallback_to_build_date: true - minify: minify_html: true - include-markdown