Skip to content
This repository has been archived by the owner on Mar 6, 2023. It is now read-only.

Commit

Permalink
compose alert rules to one simple file
Browse files Browse the repository at this point in the history
  • Loading branch information
rdemachkovych committed Nov 22, 2017
1 parent 3fba4ba commit 0867a2a
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 99 deletions.
61 changes: 7 additions & 54 deletions files/rules/alert.rules
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,26 @@ groups:
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
for more than 5 minutes.'
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
summary: Instance {{ $labels.instance }} down
# Fires when an instance's average non-idle CPU share, computed as
# 100 * (1 - avg idle-mode irate over a 5m range), stays above 96 for 2 minutes.
# `job` is part of the grouping so that {{ $labels.job }} in the description is
# not rendered empty: aggregating BY (instance) alone drops every other label.
# NOTE(review): Prometheus documentation recommends rate() over irate() for
# alerting expressions - consider switching after confirming with rule owners.
- alert: CriticalCPULoad
  expr: (100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance, job))) > 96
  for: 2m
  labels:
    severity: critical
  annotations:
    # Duration matches the `for: 2m` clause above (was "1 minutes").
    description: '{{ $labels.instance }} of job {{ $labels.job }} has Critical High CPU load for more than 2 minutes.'
    summary: Instance {{ $labels.instance }} High CPU load
# Fires when an instance's average non-idle CPU share, computed as
# 100 * (1 - avg idle-mode irate over a 5m range), stays above 90 for 2 minutes.
# `job` is part of the grouping so that {{ $labels.job }} in the description is
# not rendered empty: aggregating BY (instance) alone drops every other label.
# NOTE(review): Prometheus documentation recommends rate() over irate() for
# alerting expressions - consider switching after confirming with rule owners.
- alert: WarningCPULoad
  expr: (100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance, job))) > 90
  for: 2m
  labels:
    severity: warning
  annotations:
    # Duration matches the `for: 2m` clause above (was "1 minutes").
    description: '{{ $labels.instance }} of job {{ $labels.job }} has Warning High CPU load for more than 2 minutes.'
    summary: Instance {{ $labels.instance }} High CPU load
- alert: CriticalMemoryLoad
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers
+ node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 95
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 95
for: 5m
labels:
severity: critical
Expand All @@ -41,52 +35,11 @@ groups:
5 minutes.'
summary: Instance {{ $labels.instance }} has Critical Memory Load
# Fires when used memory (MemTotal minus MemFree, Buffers and Cached) stays
# above 85% of MemTotal for 5 minutes.
# Every sum is grouped BY (instance): a bare sum() collapses all hosts into one
# label-less series, which leaves {{ $labels.instance }} empty in the
# annotations and evaluates the threshold against the whole fleet at once.
- alert: WarningMemoryLoad
  expr: (sum(node_memory_MemTotal) BY (instance) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) BY (instance)) / sum(node_memory_MemTotal) BY (instance) * 100 > 85
  for: 5m
  labels:
    severity: warning
  annotations:
    # Removed the duplicated word ("has has") from the message.
    description: '{{ $labels.instance }} has Warning Memory Load more than 5 minutes.'
    summary: Instance {{ $labels.instance }} has Warning Memory Load
# Critical alert raised when container_last_seen lags the evaluation time
# (time()) by more than 60 * 5.
# There is no `for:` clause, so no hold period applies before firing.
# NOTE(review): assumes container_last_seen is a Unix timestamp in seconds and
# that a `host` label is attached to the series - confirm.
- alert: ContainerStopped
  expr: 'time() - container_last_seen > 60 * 5'
  labels:
    severity: 'critical'
  annotations:
    summary: 'Container {{$labels.image}} stopped'
    description: 'Container {{$labels.image}} has been stopped on {{$labels.host}}'
# Warns when the filesystem mounted at /etc/hosts has reported less than
# 1.073741824e+10 (10 * 2^30, i.e. the "10G" of the description) available
# for 10 minutes.
# NOTE(review): the /etc/hosts mountpoint presumably stands in for the host
# disk when the exporter runs inside a container - confirm.
- alert: InstanceLowDisk
  expr: node_filesystem_avail{mountpoint="/etc/hosts"} < 1.073741824e+10
  for: 10m
  labels:
    # Was misspelled "arning", which matches no severity used by other rules.
    severity: warning
  annotations:
    description: '{{$labels.host}} has less than 10G FS space'
    summary: 'Instance {{$labels.host}}: low disk space'
# Warning raised once node_memory_MemAvailable has been below
# 2.68435456e+08 (256 * 2^20, the "256M" of the description) for 10 minutes.
- alert: InstanceLowMemory
  for: '10m'
  expr: 'node_memory_MemAvailable < 2.68435456e+08'
  labels:
    severity: 'warning'
  annotations:
    summary: 'Instance {{$labels.host}}: memory low'
    description: '{{$labels.host}} has less than 256M memory available'
# Warns when a linear extrapolation of the last hour of node_filesystem_free
# samples, projected 24 * 3600 seconds ahead, drops below zero, and that
# prediction has held for 5 minutes.
- alert: DiskWillFillIn24Hours
  expr: predict_linear(node_filesystem_free{job="node"}[1h], 24 * 3600) < 0
  for: 5m
  labels:
    severity: warning
  annotations:
    # Horizon corrected from "4 hours" to match the 24 * 3600 s prediction
    # window and the alert name.
    description: '{{$labels.host}} :disk was going to fill up in 24 hours'
    summary: Instance {{$labels.host}}:disk was going to fill up
# Critical alert raised when node_load15 has exceeded twice the number of
# idle-mode node_cpu series (counted after dropping the cpu and mode labels)
# for 2 minutes.
- alert: CriticalNodeLoad
  for: '2m'
  expr: 'node_load15 > (count(node_cpu{mode="idle"}) WITHOUT (cpu, mode)) * 2'
  labels:
    severity: 'critical'
  annotations:
    summary: 'Instance {{$labels.host}} has critical node load'
    description: 'Instance {{$labels.host}} has critical node load for more than 15 minute'
45 changes: 0 additions & 45 deletions files/rules/main_alert.rules

This file was deleted.

0 comments on commit 0867a2a

Please sign in to comment.