# Patch: move the role from Prometheus 1.8.0 to 2.0.0.
#
#   defaults/main.yml               bump prometheus_version.
#   files/rules/alert.rules         replace the 1.x ALERT/IF/FOR rule syntax
#                                   with the 2.0 YAML rule-group format.
#   tasks/configure.yml             promtool validation now uses the
#                                   "check rules" / "check config" subcommands.
#   templates/prometheus.service.j2 switch to double-dash flags and
#                                   --storage.tsdb.path.
#
# Review changes relative to the submitted patch (added lines only):
#   * Memory alerts no longer wrap the metrics in sum(); the aggregation
#     dropped every label, so {{ $labels.instance }} rendered empty.
#   * CPU alert descriptions now say 2 minutes, matching "for: 2m".
#   * "has has" typo fixed in the memory alert descriptions.
#   * Extra-flags loop uses .items() so the template renders under both
#     Python 2 and Python 3.
#
# NOTE(review): six alerts present in the old file (ContainerStopped,
# InstanceLowDisk, InstanceLowMemory, DiskWillFillIn24Hours,
# isaacloudPrometheusDown, CriticalNodeLoad) are deleted and not ported.
# Confirm that this is intentional.
# NOTE(review): indentation and blank context lines were reconstructed from a
# whitespace-collapsed copy of this patch. Regenerate it with `git diff`
# against the real tree before applying. The index hashes below are the
# original ones and do not reflect the review changes.
diff --git a/defaults/main.yml b/defaults/main.yml
index 6a9a8300..43f0fe7f 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -1,5 +1,5 @@
 ---
-prometheus_version: 1.8.0
+prometheus_version: 2.0.0
 
 prometheus_config_dir: /etc/prometheus
 prometheus_db_dir: /var/lib/prometheus
diff --git a/files/rules/alert.rules b/files/rules/alert.rules
index af2ab1e2..5a65fc6e 100644
--- a/files/rules/alert.rules
+++ b/files/rules/alert.rules
@@ -1,95 +1,45 @@
-ALERT InstanceDown
-  IF up == 0
-  FOR 5m
-  LABELS { severity = "critical" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }} down",
-    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
-  }
-ALERT CriticalCPULoad
-  IF (100 * (1 - avg by(instance)(irate(node_cpu{job='node',mode='idle'}[5m])))) > 96
-  FOR 2m
-  LABELS { severity = "critical" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }} High CPU load",
-    description = "{{ $labels.instance }} of job {{ $labels.job }} has Critical High CPU load for more than 1 minutes.",
-  }
-ALERT WarningCPULoad
-  IF (100 * (1 - avg by(instance)(irate(node_cpu{job='node',mode='idle'}[5m])))) > 90
-  FOR 2m
-  LABELS { severity = "warning" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }} High CPU load",
-    description = "{{ $labels.instance }} of job {{ $labels.job }} has Warning High CPU load for more than 1 minutes.",
-  }
-ALERT CriticalMemoryLoad
-  IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 95
-  FOR 5m
-  LABELS { severity = "critical" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }} has Critical Memory Load",
-    description = "{{ $labels.instance }} has has Critical Memory Load more than 5 minutes.",
-  }
-ALERT WarningMemoryLoad
-  IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
-  FOR 5m
-  LABELS { severity = "warning" }
-  ANNOTATIONS {
-    summary = "Instance {{ $labels.instance }} has Warning Memory Load",
-    description = "{{ $labels.instance }} has has Warning Memory Load more than 5 minutes.",
-  }
-#ALERT ContainerFlapping
-#  IF time() - container_start_time_seconds < 50
-#  FOR 1m
-#  LABELS { severity = "critical" }
-#  ANNOTATIONS {
-#    summary = "{{ $labels.instance }}",
-#    description = "{{ $labels.instance }} lives less than a minute"
-#  }
-ALERT ContainerStopped
-  IF time() - container_last_seen > 60 * 5
-  LABELS { severity = "critical" }
-  ANNOTATIONS {
-    summary = "Container {{$labels.image}} stopped",
-    description = "Container {{$labels.image}} has been stopped on {{$labels.host}}"
-  }
-ALERT InstanceLowDisk
-  IF node_filesystem_avail{mountpoint="/etc/hosts"} < 10737418240
-  FOR 10m
-  LABELS { severity = "arning" }
-  ANNOTATIONS {
-    summary = "Instance {{$labels.host}}: low disk space",
-    description = "{{$labels.host}} has less than 10G FS space"
-  }
-ALERT InstanceLowMemory
-  IF node_memory_MemAvailable < 268435456
-  FOR 10m
-  LABELS { severity = "warning" }
-  ANNOTATIONS {
-    summary = "Instance {{$labels.host}}: memory low",
-    description = "{{$labels.host}} has less than 256M memory available"
-  }
-ALERT DiskWillFillIn24Hours
-  IF predict_linear(node_filesystem_free{job='node'}[1h], 24*3600) < 0
-  FOR 5m
-  LABELS { severity="warning" }
-  ANNOTATIONS {
-    summary = "Instance {{$labels.host}}:disk was going to fill up",
-    description = "{{$labels.host}} :disk was going to fill up in 4 hours"
-  }
-ALERT isaacloudPrometheusDown
-  IF up {instance="localhost:9092"} == 0
-  FOR 5m
-  LABELS {severity="critical"}
-  ANNOTATIONS {
-    summary = "isaacloud Prometheus server is down!",
-    description="This is critical alert, please check corresponding Prometheus instance.Wrong or missing data can result in wrong representation and/or interpretation of collected metrics."
-  }
-ALERT CriticalNodeLoad
-  IF node_load15 > (count(node_cpu{mode="idle"}) without (cpu,mode))*2
-  FOR 2m
-  LABELS {severity="critical"}
-  ANNOTATIONS {
-    summary = "Instance {{$labels.host}} has critical node load",
-    description = "Instance {{$labels.host}} has critical node load for more than 15 minute"
-  }
+groups:
+- name: /etc/prometheus/rules/alert.rules
+  rules:
+  - alert: InstanceDown
+    expr: up == 0
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
+      summary: Instance {{ $labels.instance }} down
+  - alert: CriticalCPULoad
+    expr: (100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance))) > 96
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $labels.instance }} of job {{ $labels.job }} has Critical High CPU load for more than 2 minutes.'
+      summary: Instance {{ $labels.instance }} High CPU load
+  - alert: WarningCPULoad
+    expr: (100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance))) > 90
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{ $labels.instance }} of job {{ $labels.job }} has Warning High CPU load for more than 2 minutes.'
+      summary: Instance {{ $labels.instance }} High CPU load
+  - alert: CriticalMemoryLoad
+    expr: (node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / node_memory_MemTotal * 100 > 95
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $labels.instance }} has had Critical Memory Load for more than
+        5 minutes.'
+      summary: Instance {{ $labels.instance }} has Critical Memory Load
+  - alert: WarningMemoryLoad
+    expr: (node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / node_memory_MemTotal * 100 > 85
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{ $labels.instance }} has had Warning Memory Load for more than 5
+        minutes.'
+      summary: Instance {{ $labels.instance }} has Warning Memory Load
diff --git a/tasks/configure.yml b/tasks/configure.yml
index 0eda6db7..936a65a6 100644
--- a/tasks/configure.yml
+++ b/tasks/configure.yml
@@ -6,7 +6,7 @@
     owner: prometheus
     group: prometheus
     mode: 0644
-    validate: "{{ prometheus_root_dir }}/promtool check-rules %s"
+    validate: "{{ prometheus_root_dir }}/promtool check rules %s"
   when: prometheus_rules_files != [] and prometheus_alertmanager_url != ""
   notify:
     - reload prometheus
@@ -19,7 +19,7 @@
     owner: prometheus
     group: prometheus
     mode: 0644
-    validate: "{{ prometheus_root_dir }}/promtool check-config %s"
+    validate: "{{ prometheus_root_dir }}/promtool check config %s"
   with_first_found:
     - files:
         - prometheus.yml.j2
diff --git a/templates/prometheus.service.j2 b/templates/prometheus.service.j2
index 1eae17ba..389ce342 100644
--- a/templates/prometheus.service.j2
+++ b/templates/prometheus.service.j2
@@ -9,11 +9,11 @@ User=prometheus
 Group=prometheus
 ExecReload=/bin/kill -HUP $MAINPID
 ExecStart={{ prometheus_root_dir }}/prometheus \
-  -config.file={{ prometheus_config_dir }}/prometheus.yml \
-  -storage.local.path={{ prometheus_db_dir }} \
-  -web.listen-address={{ prometheus_web_listen_address }} \
-  -web.external-url={{ prometheus_web_external_url }}{% for flag, flag_value in prometheus_config_flags_extra.iteritems() %}\
-  -{{ flag }}={{ flag_value }} {% endfor %}
+  --config.file={{ prometheus_config_dir }}/prometheus.yml \
+  --web.listen-address={{ prometheus_web_listen_address }} \
+  --storage.tsdb.path={{ prometheus_db_dir }} \
+  --web.external-url={{ prometheus_web_external_url }}{% for flag, flag_value in prometheus_config_flags_extra.items() %}\
+  --{{ flag }}={{ flag_value }} {% endfor %}
 SyslogIdentifier=prometheus
 Restart=always
 