Skip to content
Snippets Groups Projects
Verified Commit 0743d98d authored by Quentin Duchemin
Browse files

Remove useless warning alerts

parent 4a824486
Branches
No related tags found
1 merge request: !65 Add alerting
......@@ -2,32 +2,14 @@ groups:
# The name of the group. Must be unique within a file.
- name: disk
rules:
- record: instance:node_fs:space_used
expr: 100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100)
- record: instance:node_disk:temperature
expr: avg(smartmon_temperature_celsius_raw_value) by (instance, disk)
# The name of the alert. Must be a valid metric name.
- alert: DiskFull
# The PromQL expression to evaluate
expr: instance:node_fs:space_used > 80
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet been returned for long enough are considered pending.
# If `for` is omitted or set to 0, alerts are considered firing as soon as
# they are returned.
expr: (100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100)) > 90
for: "10m"
labels:
severity: warning
annotations:
summary: Disk 80% full on {{ $labels.instance }}
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: DiskFull
expr: instance:node_fs:space_used > 95
for: "10m"
labels:
severity: critical
annotations:
summary: Disk 95% full on {{ $labels.instance }}
summary: Disk 90% full on {{ $labels.instance }}
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: VMBackupFull
......@@ -84,16 +66,7 @@ groups:
description: "Degraded RAID array {{ $labels.device }} on {{ $labels.instance }} : {{ $value }} disks failed"
dashboard: https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }}
- alert: DiskHighTemperature
expr: instance:node_disk:temperature > 50
for: "5m"
labels:
severity: warning
annotations:
summary: Disk temperature > 50°C
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- alert: DiskHighTemperature
expr: instance:node_disk:temperature > 60
expr: (avg(smartmon_temperature_celsius_raw_value) by (instance, disk)) > 60
for: "5m"
labels:
severity: critical
......@@ -135,15 +108,6 @@ groups:
summary: CPU usage over 90%
description: CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: HighCPUTemperature
expr: instance:node_cpu_temperature > 60
for: "5m"
labels:
severity: warning
annotations:
summary: CPU temperature over 60°C
description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: HighCPUTemperature
expr: instance:node_cpu_temperature > 80
for: "5m"
......@@ -156,24 +120,13 @@ groups:
- name: ram
rules:
- record: instance:node_memory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
- alert: HighRAMUse
expr: instance:node_memory < 20
expr: ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100) < 10
for: "10m"
labels:
severity: warning
annotations:
summary: More than 80% of RAM is used
description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- alert: HighRAMUse
expr: instance:node_memory < 5
for: "10m"
labels:
severity: critical
annotations:
summary: More than 95% of RAM is used
summary: More than 90% of RAM is used
description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment