Verified Commit 4a824486 authored by Quentin Duchemin's avatar Quentin Duchemin
Browse files

Lower the time before triggering an alert for now

parent 62282ca9
......@@ -64,7 +64,7 @@ receivers:
value: '[Silence the alert]({{ template "__alert_silence_link" . }})'
short: true
- title: ':question: Documentation'
value: "[Show documentation](https://wiki.picasoft.net/doku.php?id=technique:adminsys:monitoring:metrologie:stack-picasoft)"
value: "[See the wiki](https://wiki.picasoft.net/doku.php?id=technique:adminsys:monitoring:metrologie:stack-picasoft)"
short: true
- title: ':fire: See all alerts'
value: "[AlertManager WebUI](https://alertmanager.picasoft.net)"
......
......@@ -14,7 +14,7 @@ groups:
# Alerts that have not yet been returned for long enough are considered pending.
# If `for` is omitted or set to 0, alerts are considered firing as soon as
# the expression returns them.
for: "1h"
for: "10m"
labels:
severity: warning
annotations:
......@@ -23,7 +23,7 @@ groups:
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: DiskFull
expr: instance:node_fs:space_used > 95
for: "15m"
for: "10m"
labels:
severity: critical
annotations:
......@@ -45,7 +45,7 @@ groups:
- alert: VMStorageSSDFull
# SSD storage is always called "local" at Picasoft.
expr: ((pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
for: "15m"
for: "10m"
labels:
severity: critical
annotations:
......@@ -55,7 +55,7 @@ groups:
- alert: VMStorageHDDFull
# HDD storage always has "hdd" in its name at Picasoft.
expr: ((pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
for: "15m"
for: "10m"
labels:
severity: critical
annotations:
......@@ -67,6 +67,7 @@ groups:
# This is hardcoded, but I cannot see another way to do it because VMs do not have a specific prefix.
# We must add new machines here.
expr: smartmon_device_smart_healthy{instance=~"alice|bob"} != 1
for: "1m"
labels:
severity: critical
annotations:
......@@ -75,6 +76,7 @@ groups:
dashboard: https://grafana.picasoft.net/d/PkPI4xGWz/s-m-a-r-t-info?var-node={{ $labels.instance }}
- alert: RaidDegraded
expr: (node_md_disks - node_md_disks_active) != 0
for: "1m"
labels:
severity: warning
annotations:
......@@ -126,7 +128,7 @@ groups:
expr: avg(node_hwmon_temp_celsius) by (instance)
- alert: HighCPU
expr: instance:node_cpu_utilization:ratio * 100 > 90
for: "30m"
for: "10m"
labels:
severity: warning
annotations:
......@@ -158,7 +160,7 @@ groups:
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
- alert: HighRAMUse
expr: instance:node_memory < 20
for: "1h"
for: "10m"
labels:
severity: warning
annotations:
......@@ -167,7 +169,7 @@ groups:
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- alert: HighRAMUse
expr: instance:node_memory < 5
for: "10h"
for: "10m"
labels:
severity: critical
annotations:
......@@ -218,7 +220,7 @@ groups:
rules:
- alert: 404Errors
expr: increase(traefik_service_requests_total{code=~"4[0-9][0-8]"}[15m]) > 50
for: "15m"
for: "2m"
labels:
severity: warning
annotations:
......@@ -227,7 +229,7 @@ groups:
dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
- alert: 500Errors
expr: increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) > 50
for: "15m"
for: "2m"
labels:
severity: warning
annotations:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment