This repository has been archived by the owner on Mar 6, 2023. It is now read-only.

Merge pull request #22 from SoInteractive/update_2.0
Update ansible-prometheus roles to use Prometheus 2.0
Paweł Krupa authored Nov 22, 2017
2 parents 9c39dd0 + 0867a2a commit d2c1cd3
Showing 4 changed files with 53 additions and 103 deletions.
2 changes: 1 addition & 1 deletion defaults/main.yml
@@ -1,5 +1,5 @@
---
prometheus_version: 1.8.0
prometheus_version: 2.0.0

prometheus_config_dir: /etc/prometheus
prometheus_db_dir: /var/lib/prometheus
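
The role's default Prometheus version now points at 2.0.0. As a rough sketch only (this play is not part of the repository, and the host group name is an assumption), the new default could be overridden per play like this:

- hosts: monitoring                        # assumed inventory group
  roles:
    - role: ansible-prometheus             # this repository's role
      prometheus_version: 2.0.0            # pin the 2.x release explicitly
      prometheus_db_dir: /var/lib/prometheus
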
140 changes: 45 additions & 95 deletions files/rules/alert.rules
@@ -1,95 +1,45 @@
ALERT InstanceDown
IF up == 0
FOR 5m
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} down",
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
}
ALERT CriticalCPULoad
IF (100 * (1 - avg by(instance)(irate(node_cpu{job='node',mode='idle'}[5m])))) > 96
FOR 2m
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} High CPU load",
description = "{{ $labels.instance }} of job {{ $labels.job }} has Critical High CPU load for more than 1 minutes.",
}
ALERT WarningCPULoad
IF (100 * (1 - avg by(instance)(irate(node_cpu{job='node',mode='idle'}[5m])))) > 90
FOR 2m
LABELS { severity = "warning" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} High CPU load",
description = "{{ $labels.instance }} of job {{ $labels.job }} has Warning High CPU load for more than 1 minutes.",
}
ALERT CriticalMemoryLoad
IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 95
FOR 5m
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} has Critical Memory Load",
description = "{{ $labels.instance }} has has Critical Memory Load more than 5 minutes.",
}
ALERT WarningMemoryLoad
IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
FOR 5m
LABELS { severity = "warning" }
ANNOTATIONS {
summary = "Instance {{ $labels.instance }} has Warning Memory Load",
description = "{{ $labels.instance }} has has Warning Memory Load more than 5 minutes.",
}
#ALERT ContainerFlapping
# IF time() - container_start_time_seconds < 50
# FOR 1m
# LABELS { severity = "critical" }
# ANNOTATIONS {
# summary = "{{ $labels.instance }}",
# description = "{{ $labels.instance }} lives less than a minute"
# }
ALERT ContainerStopped
IF time() - container_last_seen > 60 * 5
LABELS { severity = "critical" }
ANNOTATIONS {
summary = "Container {{$labels.image}} stopped",
description = "Container {{$labels.image}} has been stopped on {{$labels.host}}"
}
ALERT InstanceLowDisk
IF node_filesystem_avail{mountpoint="/etc/hosts"} < 10737418240
FOR 10m
LABELS { severity = "arning" }
ANNOTATIONS {
summary = "Instance {{$labels.host}}: low disk space",
description = "{{$labels.host}} has less than 10G FS space"
}
ALERT InstanceLowMemory
IF node_memory_MemAvailable < 268435456
FOR 10m
LABELS { severity = "warning" }
ANNOTATIONS {
summary = "Instance {{$labels.host}}: memory low",
description = "{{$labels.host}} has less than 256M memory available"
}
ALERT DiskWillFillIn24Hours
IF predict_linear(node_filesystem_free{job='node'}[1h], 24*3600) < 0
FOR 5m
LABELS { severity="warning" }
ANNOTATIONS {
summary = "Instance {{$labels.host}}:disk was going to fill up",
description = "{{$labels.host}} :disk was going to fill up in 4 hours"
}
ALERT isaacloudPrometheusDown
IF up {instance="localhost:9092"} == 0
FOR 5m
LABELS {severity="critical"}
ANNOTATIONS {
summary = "isaacloud Prometheus server is down!",
description="This is critical alert, please check corresponding Prometheus instance.Wrong or missing data can result in wrong representation and/or interpretation of collected metrics."
}
ALERT CriticalNodeLoad
IF node_load15 > (count(node_cpu{mode="idle"}) without (cpu,mode))*2
FOR 2m
LABELS {severity="critical"}
ANNOTATIONS {
summary = "Instance {{$labels.host}} has critical node load",
description = "Instance {{$labels.host}} has critical node load for more than 15 minute"
}
groups:
- name: /etc/prometheus/rules/alert.rules
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
summary: Instance {{ $labels.instance }} down
- alert: CriticalCPULoad
expr: (100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance))) > 96
for: 2m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has Critical High CPU load for more than 1 minutes.'
summary: Instance {{ $labels.instance }} High CPU load
- alert: WarningCPULoad
expr: (100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance))) > 90
for: 2m
labels:
severity: warning
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has Warning High CPU load for more than 1 minutes.'
summary: Instance {{ $labels.instance }} High CPU load
- alert: CriticalMemoryLoad
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 95
for: 5m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} has has Critical Memory Load more than
5 minutes.'
summary: Instance {{ $labels.instance }} has Critical Memory Load
- alert: WarningMemoryLoad
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
for: 5m
labels:
severity: warning
annotations:
description: '{{ $labels.instance }} has has Warning Memory Load more than 5
minutes.'
summary: Instance {{ $labels.instance }} has Warning Memory Load
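
Prometheus 2.0 drops the 1.x rule syntax shown in the removed block, so the rules are now grouped YAML. If other 1.x rule files need migrating, the promtool shipped with 2.0 should (to the best of my recollection) be able to rewrite them; a hypothetical one-off task, not part of this role and assuming prometheus_root_dir points at the unpacked release, might look like:

- name: Convert a legacy 1.x rules file to the 2.0 YAML format   # hypothetical helper, not in this role
  command: "{{ prometheus_root_dir }}/promtool update rules /etc/prometheus/rules/legacy.rules"
  # if memory serves, promtool writes the converted rules next to the input with a .yml suffix
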
4 changes: 2 additions & 2 deletions tasks/configure.yml
@@ -6,7 +6,7 @@
owner: prometheus
group: prometheus
mode: 0644
validate: "{{ prometheus_root_dir }}/promtool check-rules %s"
validate: "{{ prometheus_root_dir }}/promtool check rules %s"
when: prometheus_rules_files != [] and prometheus_alertmanager_url != ""
notify:
- reload prometheus
@@ -19,7 +19,7 @@
owner: prometheus
group: prometheus
mode: 0644
validate: "{{ prometheus_root_dir }}/promtool check-config %s"
validate: "{{ prometheus_root_dir }}/promtool check config %s"
with_first_found:
- files:
- prometheus.yml.j2
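
The promtool subcommands were renamed in 2.0 from check-rules and check-config to check rules and check config, which the validate: lines above now call during templating. For a manual sanity check on a host, an ad-hoc task along these lines could be used (paths are illustrative, not taken from the role):

- name: Verify the deployed configuration with the 2.0 promtool   # illustrative sketch
  command: "{{ prometheus_root_dir }}/promtool check config {{ prometheus_config_dir }}/prometheus.yml"
  changed_when: false   # a pure check, so never report a change
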
10 changes: 5 additions & 5 deletions templates/prometheus.service.j2
@@ -9,11 +9,11 @@ User=prometheus
Group=prometheus
ExecReload=/bin/kill -HUP $MAINPID
ExecStart={{ prometheus_root_dir }}/prometheus \
-config.file={{ prometheus_config_dir }}/prometheus.yml \
-storage.local.path={{ prometheus_db_dir }} \
-web.listen-address={{ prometheus_web_listen_address }} \
-web.external-url={{ prometheus_web_external_url }}{% for flag, flag_value in prometheus_config_flags_extra.iteritems() %}\
-{{ flag }}={{ flag_value }} {% endfor %}
--config.file={{ prometheus_config_dir }}/prometheus.yml \
--web.listen-address={{ prometheus_web_listen_address }} \
--storage.tsdb.path={{ prometheus_db_dir }} \
--web.external-url={{ prometheus_web_external_url }}{% for flag, flag_value in prometheus_config_flags_extra.iteritems() %}\
--{{ flag }}={{ flag_value }} {% endfor %}

SyslogIdentifier=prometheus
Restart=always
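
Prometheus 2.0 switches to double-dash flags and replaces -storage.local.path with --storage.tsdb.path, which the unit template now renders. Extra flags still flow through prometheus_config_flags_extra and each key is prefixed with --; for example (the values below are illustrations, not defaults of this role):

prometheus_config_flags_extra:
  storage.tsdb.retention: 30d   # rendered as --storage.tsdb.retention=30d
  log.level: info               # rendered as --log.level=info
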
