This repository has been archived by the owner on Mar 6, 2023. It is now read-only.

Merge pull request #22 from SoInteractive/update_2.0
Update ansible-prometheus roles to use Prometheus 2.0
Paweł Krupa authored Nov 22, 2017
2 parents 9c39dd0 + 0867a2a commit d2c1cd3
Showing 4 changed files with 53 additions and 103 deletions.
2 changes: 1 addition & 1 deletion defaults/main.yml
@@ -1,5 +1,5 @@
---
prometheus_version: 1.8.0
prometheus_version: 2.0.0

prometheus_config_dir: /etc/prometheus
prometheus_db_dir: /var/lib/prometheus
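
The role's default Prometheus version now points at 2.0.0. As a rough sketch only (this play is not part of the repository, and the host group name is an assumption), the new default could be overridden per play like this:

- hosts: monitoring                        # assumed inventory group
  roles:
    - role: ansible-prometheus             # this repository's role
      prometheus_version: 2.0.0            # pin the 2.x release explicitly
      prometheus_db_dir: /var/lib/prometheus
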
140 changes: 45 additions & 95 deletions files/rules/alert.rules
@@ -1,95 +1,45 @@
ALERT InstanceDown
IF up == 0
FOR 5m
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} down",
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
}
ALERT CriticalCPULoad
IF (100 * (1 - avg by(instance)(irate(node_cpu{job='node',mode='idle'}[5m])))) > 96
FOR 2m
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} High CPU load",
description = "{{ $labels.instance }} of job {{ $labels.job }} has Critical High CPU load for more than 1 minutes.",
}
ALERT WarningCPULoad
IF (100 * (1 - avg by(instance)(irate(node_cpu{job='node',mode='idle'}[5m])))) > 90
FOR 2m
LABELS { severity = "warning" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} High CPU load",
description = "{{ $labels.instance }} of job {{ $labels.job }} has Warning High CPU load for more than 1 minutes.",
}
ALERT CriticalMemoryLoad
IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 95
FOR 5m
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} has Critical Memory Load",
description = "{{ $labels.instance }} has has Critical Memory Load more than 5 minutes.",
}
ALERT WarningMemoryLoad
IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
FOR 5m
LABELS { severity = "warning" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} has Warning Memory Load",
description = "{{ $labels.instance }} has has Warning Memory Load more than 5 minutes.",
}
#ALERT ContainerFlapping
# IF time() - container_start_time_seconds < 50
# FOR 1m
# LABELS { severity = "critical" }
# ANNOTATIONS {
# summary = "{{ $labels.instance }}",
# description = "{{ $labels.instance }} lives less than a minute"
# }
ALERT ContainerStopped
IF time() - container_last_seen > 60 * 5
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Container {{$labels.image}} stopped",
description = "Container {{$labels.image}} has been stopped on {{$labels.host}}"
}
ALERT InstanceLowDisk
IF node_filesystem_avail{mountpoint="/etc/hosts"} < 10737418240
FOR 10m
LABELS { severity = "arning" }
ANNOTATIONS {
summary = "Instance {{$labels.host}}: low disk space",
description = "{{$labels.host}} has less than 10G FS space"
}
ALERT InstanceLowMemory
IF node_memory_MemAvailable < 268435456
FOR 10m
LABELS { severity = "warning" }
ANNOTATIONS {
summary = "Instance {{$labels.host}}: memory low",
description = "{{$labels.host}} has less than 256M memory available"
}
ALERT DiskWillFillIn24Hours
IF predict_linear(node_filesystem_free{job='node'}[1h], 24*3600) < 0
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Instance {{$labels.host}}:disk was going to fill up",
description = "{{$labels.host}} :disk was going to fill up in 4 hours"
}
ALERT isaacloudPrometheusDown
IF up {instance="localhost:9092"} == 0
FOR 5m
LABELS {severity="critical"}
ANNOTATIONS {
summary = "isaacloud Prometheus server is down!",
description="This is critical alert, please check corresponding Prometheus instance.Wrong or missing data can result in wrong representation and/or interpretation of collected metrics."
}
ALERT CriticalNodeLoad
IF node_load15 > (count(node_cpu{mode="idle"}) without (cpu,mode))*2
FOR 2m
LABELS {severity="critical"}
ANNOTATIONS {
summary = "Instance {{$labels.host}} has critical node load",
description = "Instance {{$labels.host}} has critical node load for more than 15 minute"
}
groups:
- name: /etc/prometheus/rules/alert.rules
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
summary: Instance {{ $labels.instance }} down
- alert: CriticalCPULoad
expr: (100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance))) > 96
for: 2m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has Critical High CPU load for more than 1 minutes.'
summary: Instance {{ $labels.instance }} High CPU load
- alert: WarningCPULoad
expr: (100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance))) > 90
for: 2m
labels:
severity: warning
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has Warning High CPU load for more than 1 minutes.'
summary: Instance {{ $labels.instance }} High CPU load
- alert: CriticalMemoryLoad
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 95
for: 5m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} has has Critical Memory Load more than
5 minutes.'
summary: Instance {{ $labels.instance }} has Critical Memory Load
- alert: WarningMemoryLoad
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
for: 5m
labels:
severity: warning
annotations:
description: '{{ $labels.instance }} has has Warning Memory Load more than 5
minutes.'
summary: Instance {{ $labels.instance }} has Warning Memory Load
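
Prometheus 2.0 drops the 1.x rule syntax shown in the removed block, so the rules are now grouped YAML. If other 1.x rule files need migrating, the promtool shipped with 2.0 should (to the best of my recollection) be able to rewrite them; a hypothetical one-off task, not part of this role and assuming prometheus_root_dir points at the unpacked release, might look like:

- name: Convert a legacy 1.x rules file to the 2.0 YAML format   # hypothetical helper, not in this role
  command: "{{ prometheus_root_dir }}/promtool update rules /etc/prometheus/rules/legacy.rules"
  # if memory serves, promtool writes the converted rules next to the input with a .yml suffix
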
4 changes: 2 additions & 2 deletions tasks/configure.yml
@@ -6,7 +6,7 @@
owner: prometheus
group: prometheus
mode: 0644
validate: "{{ prometheus_root_dir }}/promtool check-rules %s"
validate: "{{ prometheus_root_dir }}/promtool check rules %s"
when: prometheus_rules_files != [] and prometheus_alertmanager_url != ""
notify:
- reload prometheus
@@ -19,7 +19,7 @@
owner: prometheus
group: prometheus
mode: 0644
validate: "{{ prometheus_root_dir }}/promtool check-config %s"
validate: "{{ prometheus_root_dir }}/promtool check config %s"
with_first_found:
- files:
- prometheus.yml.j2
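
The promtool subcommands were renamed in 2.0 from check-rules and check-config to check rules and check config, which the validate: lines above now call during templating. For a manual sanity check on a host, an ad-hoc task along these lines could be used (paths are illustrative, not taken from the role):

- name: Verify the deployed configuration with the 2.0 promtool   # illustrative sketch
  command: "{{ prometheus_root_dir }}/promtool check config {{ prometheus_config_dir }}/prometheus.yml"
  changed_when: false   # a pure check, so never report a change
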
10 changes: 5 additions & 5 deletions templates/prometheus.service.j2
@@ -9,11 +9,11 @@ User=prometheus
Group=prometheus
ExecReload=/bin/kill -HUP $MAINPID
ExecStart={{ prometheus_root_dir }}/prometheus \
-config.file={{ prometheus_config_dir }}/prometheus.yml \
-storage.local.path={{ prometheus_db_dir }} \
-web.listen-address={{ prometheus_web_listen_address }} \
-web.external-url={{ prometheus_web_external_url }}{% for flag, flag_value in prometheus_config_flags_extra.iteritems() %}\
-{{ flag }}={{ flag_value }} {% endfor %}
--config.file={{ prometheus_config_dir }}/prometheus.yml \
--web.listen-address={{ prometheus_web_listen_address }} \
--storage.tsdb.path={{ prometheus_db_dir }} \
--web.external-url={{ prometheus_web_external_url }}{% for flag, flag_value in prometheus_config_flags_extra.iteritems() %}\
--{{ flag }}={{ flag_value }} {% endfor %}

SyslogIdentifier=prometheus
Restart=always
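
Prometheus 2.0 switches to double-dash flags and replaces -storage.local.path with --storage.tsdb.path, which the unit template now renders. Extra flags still flow through prometheus_config_flags_extra and each key is prefixed with --; for example (the values below are illustrations, not defaults of this role):

prometheus_config_flags_extra:
  storage.tsdb.retention: 30d   # rendered as --storage.tsdb.retention=30d
  log.level: info               # rendered as --log.level=info
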
