diff --git a/apps/fluentd.yaml b/apps/fluentd.yaml new file mode 100644 index 00000000..efe643ca --- /dev/null +++ b/apps/fluentd.yaml @@ -0,0 +1,16 @@ +--- +apiVersion: v1 +kind: App +name: "Fluentd" +keywords: + - Observability + - Logging + - Available +availableVersions: + - '1.12.4' +shortDescription: "Fluentd is an open source data collector for unified logging layer." +description: | + Fluentd is an open source data collector, which lets you unify the data collection and consumption for a better use and understanding of data. +icon: https://raw.githubusercontent.com/sysdiglabs/promcat-resources/master/apps/images/fluentd.png +website: https://www.fluentd.org/ +available: true \ No newline at end of file diff --git a/apps/images/fluentd.png b/apps/images/fluentd.png new file mode 100644 index 00000000..0f0b7c21 Binary files /dev/null and b/apps/images/fluentd.png differ diff --git a/apps/images/ntp.png b/apps/images/ntp.png new file mode 100644 index 00000000..a94f19cd Binary files /dev/null and b/apps/images/ntp.png differ diff --git a/apps/ntp.yaml b/apps/ntp.yaml new file mode 100644 index 00000000..6b0bb4f4 --- /dev/null +++ b/apps/ntp.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: App +name: "NTP" +keywords: + - Network + - Available +availableVersions: + - '4' +shortDescription: "The Network Time Protocol (NTP) is a networking protocol for clock synchronization between computer systems" +description: | + The Network Time Protocol (NTP) is a networking protocol for clock synchronization between computer systems over packet-switched, variable-latency data networks. In operation since before 1985, NTP is one of the oldest Internet protocols in current use. NTP was designed by David L. Mills of the University of Delaware. 
+icon: https://raw.githubusercontent.com/sysdiglabs/promcat-resources/master/apps/images/ntp.png +website: http://www.ntp.org/ +available: true \ No newline at end of file diff --git a/resources/fluentd/ALERTS.md b/resources/fluentd/ALERTS.md new file mode 100644 index 00000000..27b5e938 --- /dev/null +++ b/resources/fluentd/ALERTS.md @@ -0,0 +1,28 @@ +# Alerts +## No Input From Container +No Input From Container. + +## High Error Ratio +High Error Ratio. + +## High Retry Ratio +High Retry Ratio. + +## High Retry Wait +High Retry Wait. + +## Low Buffer Available Space +Low Buffer Available Space. + +## Buffer Queue Length Increasing +Buffer Queue Length Increasing. + +## Buffer Total Bytes Increasing +Buffer Total Bytes Increasing. + +## High Slow Flush Ratio +High Slow Flush Ratio. + +## No Output Records From Plugin +No Output Records From Plugin. + diff --git a/resources/fluentd/INSTALL.md b/resources/fluentd/INSTALL.md new file mode 100644 index 00000000..e4004df9 --- /dev/null +++ b/resources/fluentd/INSTALL.md @@ -0,0 +1,30 @@ +# Prerequisites +Fluentd instruments Prometheus metrics and annotates the pods with Prometheus annotations. 
+ +For Fluentd to expose Prometheus metrics, the following plugins need to be enabled: +- 'prometheus' input plugin +- 'prometheus_monitor' input plugin +- 'prometheus_output_monitor' input plugin + +As seen in the official plugin documentation (https://github.com/fluent/fluent-plugin-prometheus/blob/master/README.md), they can be enabled with the following configurations: +``` +<source> + @type prometheus + @id in_prometheus + bind "0.0.0.0" + port 24231 + metrics_path "/metrics" +</source> + +<source> + @type prometheus_monitor + @id in_prometheus_monitor +</source> + +<source> + @type prometheus_output_monitor + @id in_prometheus_output_monitor +</source> +``` + +If you are deploying Fluentd using the official Helm chart (https://github.com/fluent/helm-charts/tree/main/charts/fluentd), it already has these plugins enabled by default in its configuration, so no additional actions are needed. \ No newline at end of file diff --git a/resources/fluentd/README.md b/resources/fluentd/README.md new file mode 100644 index 00000000..43ecb243 --- /dev/null +++ b/resources/fluentd/README.md @@ -0,0 +1,12 @@ +# Fluentd +Fluentd is an open source data collector, which lets you unify the data collection and consumption for a better use and understanding of data. + + +# Prometheus and exporters +Fluentd already has a Prometheus endpoint with all the metrics exposed on the port 24231. In Kubernetes the pod is already annotated, so with the Sysdig agent you can scrape the endpoint right away. + +# Metrics +- Fluentd internal statistics + +# Attributions +Configuration files, dashboards and alerts are maintained by [Sysdig team](https://sysdig.com/). 
\ No newline at end of file diff --git a/resources/fluentd/alerts.yaml b/resources/fluentd/alerts.yaml new file mode 100644 index 00000000..8f336beb --- /dev/null +++ b/resources/fluentd/alerts.yaml @@ -0,0 +1,85 @@ +apiVersion: v1 +kind: Alert +app: Fluentd +version: 1.0.0 +appVersion: +- '1.12.4' +descriptionFile: ALERTS.md +configurations: +- kind: Prometheus + data: |- + groups: + - name: Fluentd + rules: + - alert: '[Fluentd] No Input From Container' + expr: | + sum by (input_namespace, input_container)(rate(fluentd_input_status_num_records_total[5m])) == 0 + for: 5m + labels: + severity: warning + annotations: + description: No Input From Container. + - alert: '[Fluentd] High Error Ratio' + expr: | + sum by (type, plugin_id)(rate(fluentd_output_status_num_errors[5m])) /sum by (type, plugin_id)(rate(fluentd_output_status_emit_count[5m]))> 0.05 + for: 5m + labels: + severity: critical + annotations: + description: High Error Ratio. + - alert: '[Fluentd] High Retry Ratio' + expr: | + sum by (type, plugin_id)(rate(fluentd_output_status_retry_count[5m])) /sum by (type, plugin_id)(rate(fluentd_output_status_emit_count[5m]))> 0.05 + for: 5m + labels: + severity: critical + annotations: + description: High Retry Ratio. + - alert: '[Fluentd] High Retry Wait' + expr: | + sum by (type, plugin_id)(max_over_time(fluentd_output_status_retry_wait[5m])) > 60 + for: 5m + labels: + severity: critical + annotations: + description: High Retry Wait. + - alert: '[Fluentd] Low Buffer Available Space' + expr: | + fluentd_output_status_buffer_available_space_ratio < 10 + for: 5m + labels: + severity: warning + annotations: + description: Low Buffer Available Space. + - alert: '[Fluentd] Buffer Queue Length Increasing' + expr: | + avg_over_time(fluentd_output_status_buffer_queue_length[5m]) - avg_over_time(fluentd_output_status_buffer_queue_length[5m] offset 5m)> 0 + for: 5m + labels: + severity: warning + annotations: + description: Buffer Queue Length Increasing. 
+ - alert: '[Fluentd] Buffer Total Bytes Increasing' + expr: | + avg_over_time(fluentd_output_status_buffer_total_bytes[5m]) - avg_over_time(fluentd_output_status_buffer_total_bytes[5m] offset 5m)> 0 + for: 15m + labels: + severity: warning + annotations: + description: Buffer Total Bytes Increasing. + - alert: '[Fluentd] High Slow Flush Ratio' + expr: | + sum by (type, plugin_id)(rate(fluentd_output_status_slow_flush_count[5m])) /sum by (type, plugin_id)(rate(fluentd_output_status_emit_count[5m]))> 0.05 + for: 5m + labels: + severity: warning + annotations: + description: High Slow Flush Ratio. + - alert: '[Fluentd] No Output Records From Plugin' + expr: | + rate(fluentd_output_status_emit_records[5m]) == 0 + for: 5m + labels: + severity: warning + annotations: + description: No Output Records From Plugin. \ No newline at end of file diff --git a/resources/fluentd/dashboards.yaml b/resources/fluentd/dashboards.yaml new file mode 100644 index 00000000..60bf6c51 --- /dev/null +++ b/resources/fluentd/dashboards.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Dashboard +app: Fluentd +version: 1.0.0 +appVersion: +- '1.12.4' +configurations: +- name: Fluentd + kind: Sysdig + image: fluentd/images/fluentd.png + description: | + This dashboard offers information on: + * Input/Output + * Buffer + * Flush + file: include/Fluentd.json \ No newline at end of file diff --git a/resources/fluentd/description.yaml b/resources/fluentd/description.yaml new file mode 100644 index 00000000..8f35ca18 --- /dev/null +++ b/resources/fluentd/description.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Description +app: Fluentd +version: 1.0.0 +appVersion: +- '1.12.4' +descriptionFile: README.md diff --git a/resources/fluentd/images/fluentd.png b/resources/fluentd/images/fluentd.png new file mode 100644 index 00000000..dfd02dd4 Binary files /dev/null and b/resources/fluentd/images/fluentd.png differ diff --git a/resources/fluentd/include/Fluentd.json b/resources/fluentd/include/Fluentd.json new 
file mode 100644 index 00000000..bd7d130e --- /dev/null +++ b/resources/fluentd/include/Fluentd.json @@ -0,0 +1,1615 @@ +{ + "dashboard": { + "description": "", + "eventDisplaySettings": { + "enabled": true, + "queryParams": { + "alertStatuses": [], + "categories": [], + "filter": "", + "severities": [], + "teamScope": false + } + }, + "layout": [ + { + "h": 2, + "panelId": 7, + "w": 24, + "x": 0, + "y": 0 + }, + { + "h": 1, + "panelId": 8, + "w": 24, + "x": 0, + "y": 35 + }, + { + "h": 1, + "panelId": 12, + "w": 24, + "x": 0, + "y": 52 + }, + { + "h": 1, + "panelId": 19, + "w": 24, + "x": 0, + "y": 2 + }, + { + "h": 8, + "panelId": 1, + "w": 12, + "x": 0, + "y": 19 + }, + { + "h": 8, + "panelId": 5, + "w": 12, + "x": 0, + "y": 11 + }, + { + "h": 8, + "panelId": 3, + "w": 12, + "x": 0, + "y": 3 + }, + { + "h": 8, + "panelId": 9, + "w": 12, + "x": 0, + "y": 36 + }, + { + "h": 8, + "panelId": 10, + "w": 12, + "x": 12, + "y": 36 + }, + { + "h": 8, + "panelId": 4, + "w": 12, + "x": 12, + "y": 3 + }, + { + "h": 8, + "panelId": 13, + "w": 8, + "x": 0, + "y": 53 + }, + { + "h": 8, + "panelId": 14, + "w": 8, + "x": 8, + "y": 53 + }, + { + "h": 8, + "panelId": 15, + "w": 8, + "x": 16, + "y": 53 + }, + { + "h": 8, + "panelId": 17, + "w": 12, + "x": 0, + "y": 44 + }, + { + "h": 8, + "panelId": 11, + "w": 12, + "x": 12, + "y": 44 + }, + { + "h": 8, + "panelId": 6, + "w": 12, + "x": 12, + "y": 11 + }, + { + "h": 8, + "panelId": 2, + "w": 12, + "x": 12, + "y": 19 + }, + { + "h": 8, + "panelId": 16, + "w": 12, + "x": 12, + "y": 27 + }, + { + "h": 8, + "panelId": 18, + "w": 12, + "x": 0, + "y": 27 + } + ], + "name": "Fluentd", + "panels": [ + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Error: Cluster", + "timeSeriesDisplayNameTemplate": "{{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": 
"auto" + }, + "query": "sum by (kube_cluster_name)(rate(fluentd_output_status_num_errors{kube_cluster_name=~$cluster}[$__interval]))" + }, + { + "displayInfo": { + "displayName": "Retry: Cluster", + "timeSeriesDisplayNameTemplate": "{{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name)(rate(fluentd_output_status_retry_count{kube_cluster_name=~$cluster}[$__interval]))" + }, + { + "displayInfo": { + "displayName": "Rollback: Cluster", + "timeSeriesDisplayNameTemplate": "{{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name)(rate(fluentd_output_status_rollback_count{kube_cluster_name=~$cluster}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "/s", + "maxValue": null, + "minInputFormat": "/s", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 1, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Total Output Error/Retry/Rollback rate", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Cluster", + 
"timeSeriesDisplayNameTemplate": "{{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name)(rate(fluentd_output_status_emit_records{kube_cluster_name=~$cluster}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "/s", + "maxValue": null, + "minInputFormat": "/s", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 5, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Total Output rate", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Cluster", + "timeSeriesDisplayNameTemplate": "{{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name)(rate(fluentd_input_status_num_records_total{kube_cluster_name=~$cluster}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "/s", + "maxValue": null, + "minInputFormat": "/s", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + 
}, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 3, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Total Input rate", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "description": "", + "id": 7, + "markdownSource": "# Scope details\n\ud83d\udc49 The 'input_namespace', 'input_pod' and 'input_container' scopes can be used in the 'Input' panels to filter the sources of the logs.\n\n\ud83d\udc49 The 'output_type' and 'output_plugin_id' scopes allow filtering by 'Output' in the rest of the panels. More info about 'Outputs' in the **[Fluentd official documentation](https://docs.fluentd.org/output)**.", + "name": "Scope details", + "nullValueDisplayText": null, + "panelTitleVisible": false, + "textAutosized": false, + "transparentBackground": false, + "type": "text" + }, + { + "description": "", + "id": 8, + "markdownSource": "# Buffer", + "name": "Buffer", + "nullValueDisplayText": null, + "panelTitleVisible": false, + "textAutosized": false, + "transparentBackground": false, + "type": "text" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "1", + "nullValueDisplayMode": "nullGap", + "unit": "number", + "yAxis": "auto" + }, + "query": "sum by(kube_cluster_name, type, plugin_id)(max_over_time(fluentd_output_status_buffer_queue_length{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { 
+ "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "If these values are increasing, it means Fluentd cannot flush the buffer to the destination. You will lose the data once the buffer becomes full.", + "id": 9, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Maximum Buffer Queue Length", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "B", + "nullValueDisplayMode": "nullGap", + "unit": "byte", + "yAxis": "auto" + }, + "query": "sum by(kube_cluster_name, type, plugin_id)(max_over_time(fluentd_output_status_buffer_total_bytes{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "B", + "maxValue": null, + "minInputFormat": "B", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + 
"minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "If these values are increasing, it means Fluentd cannot flush the buffer to the destination. You will lose the data once the buffer becomes full.", + "id": 10, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Maximum Buffer Bytes\n", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Container < Pod < Namespace< Cluster", + "timeSeriesDisplayNameTemplate": "{{input_container}} < {{input_pod}} < {{input_namespace}} < {{kube_cluster_name}}", + "type": "stackedArea" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "topk(50, sum by (kube_cluster_name, input_namespace, input_pod, input_container)(rate(fluentd_input_status_num_records_total{kube_cluster_name=~$cluster, input_namespace=~$input_namespace, input_pod=~$input_pod, input_container=~$input_container}[$__interval]))> 0) " + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "/s", + "maxValue": null, + "minInputFormat": "/s", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 4, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Top 50 Input 
rate per container", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "description": "", + "id": 12, + "markdownSource": "# Flush", + "name": "Flush", + "nullValueDisplayText": null, + "panelTitleVisible": false, + "textAutosized": false, + "transparentBackground": false, + "type": "text" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "ms", + "nullValueDisplayMode": "nullGap", + "unit": "relativeTime", + "yAxis": "auto" + }, + "query": "sum by(kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_flush_time_count{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "ns", + "maxValue": null, + "minInputFormat": "ns", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 13, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Flush time\n", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + 
"displayFormat": "auto", + "inputFormat": "0-100", + "nullValueDisplayMode": "nullGap", + "unit": "%", + "yAxis": "auto" + }, + "query": "sum by(kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_slow_flush_count{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))\n / \nsum by(kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_emit_count{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "0-100", + "maxValue": null, + "minInputFormat": "0-100", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 14, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Slow Flush rate", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "stackedArea" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "sum by(kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_emit_count{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": 
{ + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "/s", + "maxValue": null, + "minInputFormat": "/s", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 15, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Flushes per second", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Pod < Namespace < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} <{{type}} < {{kube_pod_name}} < {{kube_namespace_name}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "1", + "nullValueDisplayMode": "nullGap", + "unit": "number", + "yAxis": "auto" + }, + "query": "sum by(kube_cluster_name, kube_namespace_name, kube_pod_name, type, plugin_id)(max_over_time(fluentd_output_status_buffer_queue_length{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": 
"If these values are increasing, it means Fluentd cannot flush the buffer to the destination. You will lose the data once the buffer becomes full.", + "id": 17, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Maximum Buffer Queue Length per instance\n", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Pod < Namespace < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} <{{type}} < {{kube_pod_name}} < {{kube_namespace_name}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "0-100", + "nullValueDisplayMode": "nullGap", + "unit": "%", + "yAxis": "auto" + }, + "query": "min by(kube_cluster_name, kube_namespace_name, kube_pod_name, type, plugin_id)(min_over_time(fluentd_output_status_buffer_available_space_ratio{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "0-100", + "maxValue": null, + "minInputFormat": "0-100", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 11, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Buffer Available Space Ratio per instance", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + 
"advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "stackedArea" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "topk(50, sum by (kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_emit_records{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval])))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "/s", + "maxValue": null, + "minInputFormat": "/s", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 6, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Top 50 Output rate per plugin", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Error: Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_num_errors{kube_cluster_name=~$cluster, type=~$output_type, 
plugin_id=~$output_plugin_id}[$__interval]))" + }, + { + "displayInfo": { + "displayName": "Retry: Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_retry_count{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + }, + { + "displayInfo": { + "displayName": "Rollback: Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "/s", + "nullValueDisplayMode": "nullGap", + "unit": "numberRate", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_rollback_count{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "/s", + "maxValue": null, + "minInputFormat": "/s", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 2, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Output Error/Retry/Rollback rate per plugin", + "nullValueDisplayText": 
null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "s", + "nullValueDisplayMode": "nullGap", + "unit": "relativeTime", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name, type, plugin_id)(max_over_time(fluentd_output_status_retry_wait{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "ns", + "maxValue": null, + "minInputFormat": "ns", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "Maximum retry_wait computed from last retry time and next retry time.", + "id": 16, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Output Retry Wait per plugin", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Plugin < Type < Cluster", + "timeSeriesDisplayNameTemplate": "{{plugin_id}} < {{type}} < {{kube_cluster_name}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "0-100", + "nullValueDisplayMode": "nullGap", + "unit": "%", + "yAxis": "auto" + }, + "query": "sum by (kube_cluster_name, type, 
plugin_id)(rate(fluentd_output_status_num_errors{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval])) \n/ \nsum by(kube_cluster_name, type, plugin_id)(rate(fluentd_output_status_emit_count{kube_cluster_name=~$cluster, type=~$output_type, plugin_id=~$output_plugin_id}[$__interval]))" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "0-100", + "maxValue": null, + "minInputFormat": "0-100", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 18, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "Output Error ratio per plugin", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "description": "", + "id": 19, + "markdownSource": "# Input / Output", + "name": "Input / Output (2)", + "nullValueDisplayText": null, + "panelTitleVisible": false, + "textAutosized": false, + "transparentBackground": false, + "type": "text" + } + ], + "publicNotation": false, + "schema": 3, + "scopeExpressionList": [ + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": true, + "canMonitor": false, + "deferred": false, + "description": "kubernetes.cluster.name", + "documentId": "kubernetes.cluster.name", + "documentTimestamp": 1653489054073, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + 
"hidden": false, + "id": "kubernetes.cluster.name", + "identity": false, + "metricType": "tag", + "name": "kubernetes.cluster.name", + "namespaces": [ + "kubernetes.cluster" + ], + "publicId": "kube_cluster_name", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "cluster", + "isVariable": true, + "operand": "kubernetes.cluster.name", + "operator": "in", + "value": [], + "variable": true + }, + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": false, + "canMonitor": false, + "deferred": false, + "description": "input_namespace", + "documentId": "input_namespace", + "documentTimestamp": 1653489054073, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "input_namespace", + "identity": false, + "metricType": "tag", + "name": "input_namespace", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "input_namespace", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "input_namespace", + "isVariable": true, + "operand": "input_namespace", + "operator": "in", + "value": [], + "variable": true + }, + { + "descriptor": { + "aggregationForGroup": "none", 
+ "canFilter": true, + "canGroupBy": false, + "canMonitor": false, + "deferred": false, + "description": "input_pod", + "documentId": "input_pod", + "documentTimestamp": 1653489054073, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "input_pod", + "identity": false, + "metricType": "tag", + "name": "input_pod", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "input_pod", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "input_pod", + "isVariable": true, + "operand": "input_pod", + "operator": "in", + "value": [], + "variable": true + }, + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": false, + "canMonitor": false, + "deferred": false, + "description": "input_container", + "documentId": "input_container", + "documentTimestamp": 1653489054074, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "input_container", + "identity": false, + "metricType": "tag", + "name": "input_container", + "namespaces": [ + "cloudProvider", + 
"host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "input_container", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "input_container", + "isVariable": true, + "operand": "input_container", + "operator": "in", + "value": [], + "variable": true + }, + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": false, + "canMonitor": false, + "deferred": false, + "description": "type", + "documentId": "type", + "documentTimestamp": 1653489054074, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "type", + "identity": false, + "metricType": "tag", + "name": "type", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "type", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + 
"count" + ], + "type": "string" + }, + "displayName": "output_type", + "isVariable": true, + "operand": "type", + "operator": "in", + "value": [], + "variable": true + }, + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": false, + "canMonitor": false, + "deferred": false, + "description": "plugin_id", + "documentId": "plugin_id", + "documentTimestamp": 1653489054074, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "plugin_id", + "identity": false, + "metricType": "tag", + "name": "plugin_id", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "plugin_id", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "output_plugin_id", + "isVariable": true, + "operand": "plugin_id", + "operator": "in", + "value": [], + "variable": true + } + ] + } +} \ No newline at end of file diff --git a/resources/fluentd/include/sysdig-agent.yaml b/resources/fluentd/include/sysdig-agent.yaml new file mode 100644 index 00000000..693f09e4 --- /dev/null +++ b/resources/fluentd/include/sysdig-agent.yaml @@ -0,0 +1,85 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: sysdig-agent + namespace: sysdig-agent +data: + dragent.yaml: | + new_k8s: true + k8s_cluster_name: 
YourClusterName + metrics_excess_log: true + 10s_flush_enable: true + app_checks_enabled: false + use_promscrape: true + new_k8s: true + promscrape_fastproto: true + prometheus: + enabled: true + prom_service_discovery: true + log_errors: true + max_metrics: 200000 + max_metrics_per_process: 200000 + max_tags_per_metric: 100 + ingest_raw: true + ingest_calculated: false + snaplen: 512 + tags: role:cluster + prometheus.yaml: | + global: + scrape_interval: 10s + scrape_configs: + - job_name: 'fluentd-default' + tls_config: + insecure_skip_verify: true + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + source_labels: [__meta_kubernetes_pod_host_ip] + regex: __HOSTIPS__ + - action: drop + source_labels: [__meta_kubernetes_pod_annotation_promcat_sysdig_com_omit] + regex: true + - action: replace + source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] + target_label: __scheme__ + regex: (https?) + - action: replace + source_labels: + - __meta_kubernetes_pod_container_name + - __meta_kubernetes_pod_annotation_promcat_sysdig_com_integration_type + regex: (fluentd);(.{0}$) + replacement: fluentd + target_label: __meta_kubernetes_pod_annotation_promcat_sysdig_com_integration_type + - action: keep + source_labels: + - __meta_kubernetes_pod_annotation_promcat_sysdig_com_integration_type + regex: "fluentd" + - action: replace + source_labels: [__meta_kubernetes_pod_uid] + target_label: sysdig_k8s_pod_uid + - action: replace + source_labels: [__meta_kubernetes_pod_container_name] + target_label: sysdig_k8s_pod_container_name + metric_relabel_configs: + - action: replace + source_labels: + - __name__ + - tag + regex: fluentd_input_status_num_records_total;kubernetes.var.log.containers.([a-zA-Z0-9 \d\.-]+)_([a-zA-Z0-9 \d\.-]+)_([a-zA-Z0-9 \d\.-]+)-[a-zA-Z0-9]+.log + target_label: input_pod + replacement: $1 + - action: replace + source_labels: + - __name__ + - tag + regex: 
fluentd_input_status_num_records_total;kubernetes.var.log.containers.([a-zA-Z0-9 \d\.-]+)_([a-zA-Z0-9 \d\.-]+)_([a-zA-Z0-9 \d\.-]+)-[a-zA-Z0-9]+.log + target_label: input_namespace + replacement: $2 + - action: replace + source_labels: + - __name__ + - tag + regex: fluentd_input_status_num_records_total;kubernetes.var.log.containers.([a-zA-Z0-9 \d\.-]+)_([a-zA-Z0-9 \d\.-]+)_([a-zA-Z0-9 \d\.-]+)-[a-zA-Z0-9]+.log + target_label: input_container + replacement: $3 \ No newline at end of file diff --git a/resources/fluentd/setup-guide.yaml b/resources/fluentd/setup-guide.yaml new file mode 100644 index 00000000..f150b5aa --- /dev/null +++ b/resources/fluentd/setup-guide.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: SetupGuide +app: Fluentd +version: 1.0.0 +appVersion: +- '1.12.4' +configurations: +- name: sysdig-agent.yaml + file: include/sysdig-agent.yaml +descriptionFile: INSTALL.md diff --git a/resources/ntp/ALERTS.md b/resources/ntp/ALERTS.md new file mode 100644 index 00000000..5b3338db --- /dev/null +++ b/resources/ntp/ALERTS.md @@ -0,0 +1,3 @@ +# Alerts +## Drift is too high +The time drift is more than 10s diff --git a/resources/ntp/INSTALL.md b/resources/ntp/INSTALL.md new file mode 100644 index 00000000..1261adb2 --- /dev/null +++ b/resources/ntp/INSTALL.md @@ -0,0 +1,12 @@ +# Installing the exporter +To install the [NTP exporter](https://github.com/sapcc/ntp_exporter), use the example deployment below: + + +In the args, just replace the server with the one you are using: + +``` +args: [ + "-ntp.server", "Your-ntp-server" +] +``` + diff --git a/resources/ntp/README.md b/resources/ntp/README.md new file mode 100644 index 00000000..fc45be54 --- /dev/null +++ b/resources/ntp/README.md @@ -0,0 +1,16 @@ +# NTP +The [Network Time Protocol (NTP)](http://www.ntp.org/) is a networking protocol for clock synchronization between computer systems over packet-switched, variable-latency data networks. 
In operation since before 1985, NTP is one of the oldest Internet protocols in current use. NTP was designed by David L. Mills of the University of Delaware. + +To extract metrics you can use the [NTP Exporter](https://github.com/sapcc/ntp_exporter). + +# Metrics +The metrics available are the ones related to the modules: +* Drift + +# Number of time series generated +* Each instance generates ~4 metrics + +# Attributions +The configuration files, dashboards, and alerts are maintained by the [Sysdig team](https://sysdig.com/). + +This integration uses the [NTP Exporter](https://github.com/sapcc/ntp_exporter), which is released under the Apache 2.0 license. diff --git a/resources/ntp/alerts.yaml b/resources/ntp/alerts.yaml new file mode 100644 index 00000000..90598c9c --- /dev/null +++ b/resources/ntp/alerts.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Alert +app: NTP +version: 1.0.0 +appVersion: +- '4' +descriptionFile: ALERTS.md +configurations: +- kind: Prometheus + data: |- + groups: + - name: NTP + rules: + - alert: '[NTP] Drift is too high' + expr: "ntp_drift_seconds > 10" + for: 5m + labels: + severity: critical + annotations: + description: Drift is too high. 
diff --git a/resources/ntp/dashboards.yaml b/resources/ntp/dashboards.yaml new file mode 100644 index 00000000..0656ce19 --- /dev/null +++ b/resources/ntp/dashboards.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Dashboard +app: 'NTP' +version: 1.0.0 +appVersion: +- '4' +configurations: +- name: 'Instance Health' + kind: Sysdig + image: 'ntp/images/ntp.png' + description: | + This dashboard offers information on: + * Drift per node + file: include/ntp.json diff --git a/resources/ntp/description.yaml b/resources/ntp/description.yaml new file mode 100644 index 00000000..84702732 --- /dev/null +++ b/resources/ntp/description.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Description +app: NTP +version: 1.0.0 +appVersion: +- '4' +descriptionFile: README.md diff --git a/resources/ntp/images/ntp.png b/resources/ntp/images/ntp.png new file mode 100644 index 00000000..3a74484b Binary files /dev/null and b/resources/ntp/images/ntp.png differ diff --git a/resources/ntp/include/ntp-deploy.yaml b/resources/ntp/include/ntp-deploy.yaml new file mode 100644 index 00000000..bc0b099f --- /dev/null +++ b/resources/ntp/include/ntp-deploy.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: ntp-exporter + labels: + k8s-app: ntp-exporter +spec: + selector: + matchLabels: + k8s-app: ntp-exporter + template: + metadata: + labels: + k8s-app: ntp-exporter + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: ntp-exporter + image: sapcc/ntp-exporter:v2.0.2 + args: [ + "-ntp.server", "Your-NTP-server" + ] + resources: + limits: + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi \ No newline at end of file diff --git a/resources/ntp/include/ntp.json b/resources/ntp/include/ntp.json new file mode 100644 index 00000000..27333753 --- /dev/null +++ b/resources/ntp/include/ntp.json @@ -0,0 +1,555 @@ +{ + "dashboard": { + "description": "", + "eventDisplaySettings": { + "enabled": true, + "queryParams": { + "alertStatuses": [], + 
"categories": [], + "filter": "", + "severities": [], + "teamScope": false + } + }, + "layout": [ + { + "h": 7, + "panelId": 2, + "w": 24, + "x": 0, + "y": 0 + }, + { + "h": 11, + "panelId": 1, + "w": 24, + "x": 0, + "y": 7 + } + ], + "name": "NTP", + "panels": [ + { + "advancedQueries": [ + { + "displayInfo": { + "displayName": "Node < NTP server", + "timeSeriesDisplayNameTemplate": "{{kube_node_name}} > {{server}}", + "type": "lines" + }, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "s", + "nullValueDisplayMode": "nullGap", + "unit": "relativeTime", + "yAxis": "auto" + }, + "query": "ntp_drift_seconds{_sysdig_datasource=\"agent\",kube_cluster_name=~$cluster,kube_node_name=~$node}" + } + ], + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "ns", + "maxValue": null, + "minInputFormat": "ns", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + }, + "right": { + "decimals": null, + "displayFormat": "auto", + "displayName": null, + "enabled": true, + "maxInputFormat": "1", + "maxValue": null, + "minInputFormat": "1", + "minValue": 0.0, + "scale": "linear", + "unit": "auto" + } + }, + "description": "", + "id": 1, + "legendConfiguration": { + "enabled": true, + "height": null, + "layout": "table", + "position": "bottom", + "showCurrent": true, + "width": null + }, + "name": "NTP drift", + "nullValueDisplayText": null, + "type": "advancedTimechart" + }, + { + "basicQueries": [ + { + "compareTo": { + "delta": 1, + "enabled": false, + "timeFormat": "day" + }, + "displayInfo": { + "displayName": "", + "timeSeriesDisplayNameTemplate": "", + "type": "lines" + }, + "displayedValue": null, + "enabled": true, + "format": { + "decimals": null, + "displayFormat": "auto", + "inputFormat": "s", + "nullValueDisplayMode": "nullGap", + "unit": "relativeTime", + "yAxis": "auto" + }, + "metrics": [ 
+ { + "descriptor": { + "aggregationForGroup": "avg", + "canFilter": false, + "canGroupBy": false, + "canMonitor": false, + "category": "prometheus", + "deferred": false, + "description": "", + "documentId": "prometheus.ntp_drift_seconds", + "documentTimestamp": 1652364230019, + "documentType": "metric", + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.PrometheusRawMetric", + "groupAggregations": [ + "avg", + "sum", + "min", + "max" + ], + "heuristic": false, + "hidden": false, + "id": "ntp_drift_seconds", + "identity": false, + "lastSeen": 1652357943000, + "legacyId": "ntp_drift_seconds", + "metricType": "gauge", + "name": "ntp_drift_seconds", + "namespaces": [ + "host", + "host.process", + "host.container", + "cloudProvider", + "mesos", + "ecs", + "kubernetes.cluster", + "kubernetes.namespace", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.daemonSet", + "kubernetes.service", + "kubernetes.node", + "kubernetes.replicaSet", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.persistentvolume", + "kubernetes.persistentvolumeclaim", + "kubernetes.pod" + ], + "publicId": "ntp_drift_seconds", + "scale": 1000000000.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "avg", + "min", + "max" + ], + "type": "relativeTime" + }, + "groupAggregation": "avg", + "id": "ntp_drift_seconds", + "sorting": null, + "timeAggregation": "avg" + } + ], + "scope": { + "expressions": [], + "extendsDashboardScope": true + }, + "segmentation": { + "direction": "desc", + "labels": [ + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": true, + "canMonitor": false, + "deferred": false, + "description": "kube_cluster_name", + "documentId": "kube_cluster_name", + "documentTimestamp": 1652364230019, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + 
"distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "kube_cluster_name", + "identity": false, + "metricType": "tag", + "name": "kube_cluster_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "kube_cluster_name", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "cluster", + "id": "kube_cluster_name", + "sorting": null + }, + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": true, + "canMonitor": false, + "deferred": false, + "description": "kube_node_name", + "documentId": "kube_node_name", + "documentTimestamp": 1652364230019, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "kube_node_name", + "identity": false, + "metricType": "tag", + "name": "kube_node_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + 
"host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "kube_node_name", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "node", + "id": "kube_node_name", + "sorting": null + }, + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": false, + "canMonitor": false, + "deferred": false, + "description": "server", + "documentId": "server", + "documentTimestamp": 1652364230019, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "server", + "identity": false, + "metricType": "tag", + "name": "server", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "server", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "server", + "id": "server", + "sorting": null + } + ], + "limit": 10 + } + } + ], + "description": "", + "id": 2, + "name": "New Panel", + "nullValueDisplayText": null, + "type": "basicTable" + } + ], + "publicNotation": false, + "schema": 3, + "scopeExpressionList": [ + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": true, + "canMonitor": false, + "deferred": false, + 
"description": "kube_cluster_name", + "documentId": "kube_cluster_name", + "documentTimestamp": 1652364230019, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "kube_cluster_name", + "identity": false, + "metricType": "tag", + "name": "kube_cluster_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "kube_cluster_name", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "cluster", + "isVariable": true, + "operand": "kube_cluster_name", + "operator": "in", + "value": [], + "variable": true + }, + { + "descriptor": { + "aggregationForGroup": "none", + "canFilter": true, + "canGroupBy": true, + "canMonitor": false, + "deferred": false, + "description": "kube_node_name", + "documentId": "kube_node_name", + "documentTimestamp": 1652364230019, + "documentType": "metric", + "documented": true, + "experimental": false, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "heuristic": false, + "hidden": false, + "id": "kube_node_name", + "identity": false, + "metricType": "tag", + "name": "kube_node_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + 
"host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "publicId": "kube_node_name", + "scale": 0.0, + "scopes": [], + "segment": false, + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "type": "string" + }, + "displayName": "node", + "isVariable": true, + "operand": "kube_node_name", + "operator": "in", + "value": [], + "variable": true + } + ] + } +} \ No newline at end of file diff --git a/resources/ntp/setup-guide.yaml b/resources/ntp/setup-guide.yaml new file mode 100644 index 00000000..8aaa8142 --- /dev/null +++ b/resources/ntp/setup-guide.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: SetupGuide +app: NTP +version: 1.0.0 +appVersion: +- '4' +descriptionFile: INSTALL.md +configurations: +- name: ntp-deploy.yaml + file: include/ntp-deploy.yaml diff --git a/resources/portworx/include/Portworx_Cluster.json b/resources/portworx/include/Portworx_Cluster.json index 353ec9fe..b61234e9 100644 --- a/resources/portworx/include/Portworx_Cluster.json +++ b/resources/portworx/include/Portworx_Cluster.json @@ -237,7 +237,7 @@ "y": 36 } ], - "name": "Portworx Cluster\n", + "name": "Portworx Cluster", "panels": [ { "advancedQueries": [ @@ -312,7 +312,7 @@ ], "description": "", "id": 3, - "name": "Cluster size\n", + "name": "Cluster size", "nullValueDisplayText": null, "numberThresholds": { "base": {