diff --git a/README.md b/README.md index 9d6a9b5..7fbaa7e 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ Because metrics datas retrieved from federated prometheuses servers must have th |cadvisor|Kubernetes|helm||Install cadvisor for in-depth metrics in Kubernetes workloads.| |elasticsearch-exporter|VM, instances, etc.|ansible||Install elasticsearch-exporter on classic instance or virtual machines| |elasticsearch-exporter|Kubernetes|helm||Install elasticsearch-exporter within Kubernetes.| +|apache-exporter|VM, instances, etc.|ansible||Install apache-exporter on classic instance or virtual machines.| + ## Variables @@ -46,7 +48,7 @@ A list of clients to federate. Each client in `clients` has the following variab - `username` : specify the username for basic_auth to request the endpoint. - `password` : specify the password for basic_auth to request the endpoint. - `kubernetes_hosted`: specify if the prometheus endpoint is hosted on Kubernetes. If `true`, then it will also retrieve metrics data and create dashboards for this Kubernetes cluster. -- `products`: a list of products the client has. It means the specified federated prometheus actually has metrics data for these products. For now, it only allows `node`, `kubernetes`, and `elasticsearch`. +- `products`: a list of products the client has. It means the specified federated prometheus actually has metrics data for these products. For now, it only allows `node`, `kubernetes`, `elasticsearch` and `apache`. - `prometheus_rules`: a list of custom prometheus rules based on [Prometheus 2.0 documentation](https://prometheus.io/docs/prometheus/latest/configuration/template_examples/), to create for this client. Please note to use `!unsafe` keyword as prefix for every string that uses dollar sign `$` to avoid templating. 
By default, `prometheus_rules` is empty and [standard rules](https://github.com/scalair/tethys/blob/dev/templates/prometheus/client.rules.j2) will be automatically applied for each defined `products`. _Example usage:_ diff --git a/tasks/generate-dashboards.yml b/tasks/generate-dashboards.yml index 16593aa..b5e9e84 100644 --- a/tasks/generate-dashboards.yml +++ b/tasks/generate-dashboards.yml @@ -36,3 +36,11 @@ loop: "{{ client.prometheus_federation }}" loop_control: loop_var: clusterID + +- name: generate-dashboards| apache + template: + src: dashboards/apache.json.j2 + dest: "{{ tmpdir.path }}/grafana/{{ client.name }}-apache.json" + mode: 0640 + when: "'apache' in client.products" + diff --git a/tasks/generate-rules.yml b/tasks/generate-rules.yml index 42f4943..f03095c 100644 --- a/tasks/generate-rules.yml +++ b/tasks/generate-rules.yml @@ -21,7 +21,7 @@ tags: - generate-rules -- name: generate-rules| rules files directory +- name: generate-rules| show rules files directory debug: msg: "Prometheus rules directory : {{ tmpdir.path }}/prometheus/" tags: diff --git a/templates/dashboards/apache.json.j2 b/templates/dashboards/apache.json.j2 new file mode 100644 index 0000000..b50ab6c --- /dev/null +++ b/templates/dashboards/apache.json.j2 @@ -0,0 +1,782 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": 3894, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + 
"refresh": "1m", + "rows": [ + { + "collapse": false, + "height": 102, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 1, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "s", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apache_uptime_seconds_total{clientID=\"$clientID\", instance=~\"$host:$port\"}", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 1800 + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fill": 7, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 3, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": 
"Apache Down", + "color": "#BF1B00" + }, + { + "alias": "Apache Down", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "span": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count(apache_up{clientID=\"$clientID\", instance=~\"$host:$port\"} == 1)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Apache Up", + "refId": "A", + "step": 120 + }, + { + "expr": "scalar(count(apache_up{clientID=\"$clientID\", instance=~\"$host:$port\"} == 0))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Apache Down", + "refId": "B", + "step": 120 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Apache Up / Down", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": "1", + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Basic Apache status", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "365", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 1, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"rate(apache_sent_kilobytes_total{clientID=\"$clientID\", instance=~\"$host:$port\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Kilobytes Sent", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Current total kbytes sent", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "deckbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 1, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(apache_accesses_total{clientID=\"$clientID\", instance=~\"$host:$port\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Accesses", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Current total apache accesses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, 
+ { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Traffic Load", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "365", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 1, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "apache_scoreboard{clientID=\"$clientID\", instance=~\"$host:$port\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ '{{ state }}' }}", + "refId": "A", + "step": 120 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Apache scoreboard statuses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Scoreboard", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "365", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"${DS_PROMETHEUS}", + "decimals": 2, + "fill": 1, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "apache_workers{clientID=\"$clientID\", instance=~\"$host:$port\"}\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ '{{ state }}' }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Apache worker statuses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 1, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "apache_cpuload{clientID=\"$clientID\", instance=~\"$host:$port\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Load", 
+ "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Apache CPU load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Processing Load", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "apache", + "{{ client.name }}" + ], + "templating": { + "list": [ + { + "current": { + "text": "No data sources found", + "value": "" + }, + "hide": 2, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "host", + "multi": false, + "name": "host", + "options": [], + "query": "label_values(apache_up{clientID=\"$clientID\"}, instance)", + "refresh": 2, + "regex": "/([^:]+):.*/", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "port", + "options": [], + "query": "label_values(apache_up{clientID=\"$clientID\"}, instance)", + "refresh": 1, + "regex": "/[^:]+:(.*)/", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + 
"text": "{{ client.name }}", + "value": "{{ client.name }}" + }, + "error": null, + "hide": 2, + "label": null, + "name": "clientID", + "options": [ + { + "selected": true, + "text": "{{ client.name }}", + "value": "{{ client.name }}" + } + ], + "query": "{{ client.name }}", + "skipUrlSync": false, + "type": "constant" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Apache - {{ client.name }}", + "version": 1, + "description": "Apache Web Server dashboard." +} \ No newline at end of file diff --git a/templates/prometheus/client.rules.j2 b/templates/prometheus/client.rules.j2 index 7e77d33..fd1ee82 100644 --- a/templates/prometheus/client.rules.j2 +++ b/templates/prometheus/client.rules.j2 @@ -567,3 +567,56 @@ groups: labels: severity: critical {% endif %} + +{% if 'apache' in client.products %} +- name: "{{ client.name }}-apache" + rules: + - alert: ApacheDown + annotations: + description: '{% raw %}Apache down on {{ $labels.instance }}. 
Job {{ $labels.job }}.{% endraw %}' + summary: "{% raw %}Apache down (instance {{ $labels.instance }}){% endraw %}" + expr: '{% raw %}apache_up{clientID="{% endraw %}{{ client.name }}{% raw %}"} == 0{% endraw %}' + for: 0m + labels: + severity: critical + - alert: ApacheRestart + expr: '{% raw %}apache_uptime_seconds_total{clientID="{% endraw %}{{ client.name }}{% raw %}"} / 60 < 1{% endraw %}' + for: 0m + labels: + severity: warning + annotations: + summary: "{% raw %}Apache restart (instance {{ $labels.instance }}){% endraw %}" + description: "{% raw %}Apache has just been restarted on instance {{ $labels.instance }}.{% endraw %}" + - alert: ApacheWorkersLoad + expr: '{% raw %}(sum by (instance) (apache_workers{state="busy",clientID="{% endraw %}{{ client.name }}{% raw %}"}) / sum by (instance) (apache_scoreboard{clientID="{% endraw %}{{ client.name }}{% raw %}"}) ) * 100 > 80{% endraw %}' + for: 5m + labels: + severity: warning + annotations: + summary: "{% raw %}Apache workers load (instance {{ $labels.instance }}){% endraw %}" + description: "{% raw %}Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }} for more than 5m{% endraw %}" + - alert: ApacheWorkersLoad + expr: '{% raw %}(sum by (instance) (apache_workers{state="busy",clientID="{% endraw %}{{ client.name }}{% raw %}"}) / sum by (instance) (apache_scoreboard{clientID="{% endraw %}{{ client.name }}{% raw %}"}) ) * 100 > 90{% endraw %}' + for: 1h + labels: + severity: critical + annotations: + summary: "{% raw %}Apache workers load (instance {{ $labels.instance }}){% endraw %}" + description: "{% raw %}Apache workers in busy state approach the max workers count 90% workers busy on {{ $labels.instance }} for more than 1h{% endraw %}" + - alert: ApacheFileDescriptor + expr: '{% raw %}(sum by (instance) (process_open_fds{clientID="{% endraw %}{{ client.name }}{% raw %}",job="apache-exporter"}) / sum by (instance) (process_max_fds{clientID="{% endraw %}{{ 
client.name }}{% raw %}",job="apache-exporter"}) ) * 100 > 80{% endraw %}' + for: 5m + labels: + severity: warning + annotations: + summary: "{% raw %}Apache file descriptor usage (instance {{ $labels.instance }}){% endraw %}" + description: "{% raw %}Apache used file descriptor approach 80% of the max file descriptor on {{ $labels.instance }}.{% endraw %}" + - alert: ApacheFileDescriptor + expr: '{% raw %}(sum by (instance) (process_open_fds{clientID="{% endraw %}{{ client.name }}{% raw %}",job="apache-exporter"}) / sum by (instance) (process_max_fds{clientID="{% endraw %}{{ client.name }}{% raw %}",job="apache-exporter"}) ) * 100 > 90{% endraw %}' + for: 30m + labels: + severity: critical + annotations: + summary: "{% raw %}Apache file descriptor usage (instance {{ $labels.instance }}){% endraw %}" + description: "{% raw %}Apache used file descriptor approach 90% of the max file descriptor on {{ $labels.instance }} for more than 30m.{% endraw %}" +{% endif %} diff --git a/templates/prometheus/playbook.yml.j2 b/templates/prometheus/playbook.yml.j2 index 9b04d29..3331556 100644 --- a/templates/prometheus/playbook.yml.j2 +++ b/templates/prometheus/playbook.yml.j2 @@ -45,6 +45,9 @@ {% if 'elasticsearch' in client.products %} - '{job="elasticsearch-exporter"}' {% endif %} +{% if 'apache' in client.products %} + - '{job="apache-exporter"}' +{% endif %} {% else %} params: 'match[]':