Verified commit 7fa2bb33 authored by Quentin Duchemin

Add initial set of alerting rules (CPU, RAM, volumes, disks, services, HTTP errors)

parent 72f3b353
@@ -80,7 +80,7 @@ services:
# Where to write and read alert states, to keep
# state across restarts, as vmalert keeps states in memory
- "-remoteWrite.url=http://victoria-metrics:8428"
- "-remoteRead.url=http://victoriametrics:8428"
- "-remoteRead.url=http://victoria-metrics:8428"
# Where to send alerts when they fire
- "-notifier.url=http://alertmanager:9093"
# HTTP server for vmalert's own metrics
......
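A minimal sketch of the vmalert service these flags belong to, for context; the image name, rule mount path and -datasource.url value are assumptions, not shown in this diff:

  vmalert:
    image: victoriametrics/vmalert
    command:
      # Where the rule files (like the one below) are mounted; path is assumed
      - "-rule=/etc/alerts/*.yml"
      # Where PromQL expressions are evaluated; assumed to be the same VictoriaMetrics instance
      - "-datasource.url=http://victoria-metrics:8428"
      - "-remoteWrite.url=http://victoria-metrics:8428"
      - "-remoteRead.url=http://victoria-metrics:8428"
      - "-notifier.url=http://alertmanager:9093"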
groups:
# The name of the group. Must be unique within a file.
- name: disk
rules:
- record: instance:node_fs:avail_size
expr: (node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100
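      # e.g. a 500 GiB ext4 filesystem with 150 GiB free yields
      # instance:node_fs:avail_size = 30 (percent of space still available)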
- record: instance:node_disk:temperature
expr: avg(smartmon_temperature_celsius_raw_value) by (instance, disk)
# The name of the alert. Must be a valid metric name.
- alert: DiskFull
# The PromQL expression to evaluate
expr: instance:node_fs:avail_size < 20
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
# If param is omitted or set to 0 then alerts will be immediately considered
# as firing once they return.
for: "1h"
labels:
severity: warning
annotations:
summary: Disk 80% full on {{ $labels.instance }}
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} has only {{ $value }}% of its space left
- alert: DiskFull
expr: instance:node_fs:avail_size < 5
for: "15m"
labels:
severity: critical
annotations:
summary: Disk 95% full on {{ $labels.instance }}
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} has only {{ $value }}% of its space left
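      # Below 5%, both DiskFull rules match, so two alerts with the same name but
      # different severity labels fire; Alertmanager routing or inhibition can
      # deduplicate them if needed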
- alert: DiskFullSoon
# Based on the decrease of available filesystem space and a simple linear regression,
# check whether available space could reach 0 within 24 hours at the current rate (e.g. during a mass upload of files)
expr: predict_linear(node_filesystem_avail_bytes{fstype=~"(ext.|xfs|zfs)"}[6h], 24 * 3600) < 0
for: "30m"
labels:
severity: warning
annotations:
summary: Disk expected to be full soon
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} could be full within 24 hours
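      # Hypothetical example: a filesystem losing ~10 GiB/hour over the last 6 hours
      # with 200 GiB left extrapolates to 200 - 10 * 24 = -40 GiB at t+24h, which is
      # below 0, so this alert fires even though the disk may not be 80% full yet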
- alert: VMBackupFull
# Backup storage is always called "save" at Picasoft.
# pve_storage_info is always 1, and multiplying by it (joined on `id`) copies the `storage` label,
# which is missing from pve_disk_* but present in pve_storage_info, into the resulting vector
expr: (pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
for: "6h"
labels:
severity: warning
annotations:
summary: Proxmox backup volume 90% full
description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
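      # Illustrative join, with made-up values:
      #   pve_disk_usage_bytes{id="storage/pve/save"}              950e9
      #   pve_disk_size_bytes{id="storage/pve/save"}               1e12
      #   pve_storage_info{id="storage/pve/save", storage="save"}  1
      # usage / size = 0.95; multiplying by pve_storage_info (always 1) with
      # `on (id) group_left(storage)` copies storage="save" into the result,
      # and 0.95 * 100 = 95 > 90, so the alert fires after 6 hours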
- alert: VMStorageSSDFull
# SSD storage is always called "local" at Picasoft.
expr: (pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
for: "15m"
labels:
severity: critical
annotations:
summary: Proxmox SSD volume 90% full
description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
- alert: VMStorageHDDFull
# HDD storage always has "hdd" in its name at Picasoft.
expr: (pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
for: "15m"
labels:
severity: critical
annotations:
summary: Proxmox HDD volume 90% full
description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
- alert: DiskDamaged
expr: smartmon_device_smart_healthy != 1
labels:
severity: critical
annotations:
summary: Physical disk unhealthy
description: Disk {{ $labels.disk }} on machine {{ $labels.instance }} is marked unhealthy by S.M.A.R.T. values
- alert: RaidDegraded
expr: (node_md_disks - node_md_disks_active) != 0
labels:
severity: warning
annotations:
summary: RAID array on node {{ $labels.instance }} is degraded
description: "Degraded RAID array {{ $labels.device }} on {{ $labels.instance }}: {{ $value }} disks failed"
- alert: DiskHighTemperature
expr: instance:node_disk:temperature > 50
for: "5m"
labels:
severity: warning
annotations:
summary: Disk temperature > 50°C
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
- alert: DiskHighTemperature
expr: instance:node_disk:temperature > 60
for: "5m"
labels:
severity: critical
annotations:
summary: Disk temperature > 60°C
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
- name: cpu
rules:
# Records are useful to pre-compute metrics
# and re-use them in alerting rules
# The count of CPUs per node, useful for getting CPU time as a percent of total.
- record: instance:node_cpus:count
expr: >
count without (cpu, mode) (
node_cpu_seconds_total{mode="idle"}
)
# CPU in use by mode.
- record: instance_mode:node_cpu_seconds:rate1m
expr: >
sum without (cpu) (
rate(node_cpu_seconds_total[1m])
)
# CPU in use ratio.
- record: instance:node_cpu_utilization:ratio
expr: >
sum without (mode) (
instance_mode:node_cpu_seconds:rate1m{mode!="idle"}
) / instance:node_cpus:count
- record: instance:node_cpu_temperature
expr: avg(node_hwmon_temp_celsius) by (instance)
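      # Illustrative chain, with made-up values: on an 8-core node,
      #   instance:node_cpus:count = 8
      #   sum of non-idle instance_mode:node_cpu_seconds:rate1m = 6 CPU-seconds/second
      #   instance:node_cpu_utilization:ratio = 6 / 8 = 0.75, i.e. 75% utilization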
- alert: HighCPU
expr: instance:node_cpu_utilization:ratio * 100 > 90
for: "30m"
labels:
severity: warning
annotations:
summary: CPU usage over 90%
description: CPU usage is {{ $value }}% on {{ $labels.instance }} over the past 30 minutes
- alert: HighCPUTemperature
expr: instance:node_cpu_temperature > 60
for: "5m"
labels:
severity: warning
annotations:
summary: CPU temperature over 60°C
description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
- alert: HighCPUTemperature
expr: instance:node_cpu_temperature > 80
for: "5m"
labels:
severity: critical
annotations:
summary: CPU temperature over 80°C
description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
- name: ram
rules:
- record: instance:node_memory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
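      # e.g. 2 GiB available out of 16 GiB total gives instance:node_memory = 12.5
      # (percent available), which is below 20 and would trigger the warning below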
- alert: HighRAMUse
expr: instance:node_memory < 20
for: "1h"
labels:
severity: warning
annotations:
summary: More than 80% of RAM is used
description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
- alert: HighRAMUse
expr: instance:node_memory < 5
for: "10h"
labels:
severity: critical
annotations:
summary: More than 95% of RAM is used
description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
- name: network
rules:
- alert: ReceiveHighErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: "5m"
labels:
severity: warning
annotations:
summary: Network interface is reporting many receive errors
description: '{{ $labels.instance }} interface {{ $labels.device }} has had more than 1% receive errors over the last two minutes ({{ printf "%.2f" $value }} errors per packet).'
- alert: SendHighErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: "5m"
labels:
severity: warning
annotations:
summary: Network interface is reporting many transmit errors
description: '{{ $labels.instance }} interface {{ $labels.device }} has had more than 1% transmit errors over the last two minutes ({{ printf "%.2f" $value }} errors per packet).'
- alert: ReceiveHighDrop
expr: rate(node_network_receive_drop_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: "5m"
labels:
severity: warning
annotations:
summary: Network interface is reporting many receive drops
description: '{{ $labels.instance }} interface {{ $labels.device }} has had more than 1% receive drops over the last two minutes ({{ printf "%.2f" $value }} drops per packet).'
- alert: SendHighDrop
expr: rate(node_network_transmit_drop_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: "5m"
labels:
severity: warning
annotations:
summary: Network interface is reporting many transmit drops
description: '{{ $labels.instance }} interface {{ $labels.device }} has had more than 1% transmit drops over the last two minutes ({{ printf "%.2f" $value }} drops per packet).'
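      # Illustrative: an interface receiving ~100 packets/s of which ~1.5/s error out
      # gives rate(errs)/rate(packets) = 0.015 > 0.01, so the matching alert fires
      # once that ratio has held for 5 minutes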
- name: services
rules:
- alert: 404Errors
expr: increase(traefik_service_requests_total{code=~"4[0-9][0-8]"}[15m]) > 50
for: "15m"
labels:
severity: warning
annotations:
summary: Many 4XX errors
description: Service {{ $labels.service_name }} running on {{ $labels.instance }} is encountering many {{ $labels.code }} errors.
- alert: 500Errors
expr: increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) > 50
for: "15m"
labels:
severity: warning
annotations:
summary: Many 5XX errors
description: Service {{ $labels.service_name }} running on {{ $labels.instance }} is encountering many {{ $labels.code }} errors.
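      # e.g. increase(traefik_service_requests_total{code="502"}[15m]) = 80 means
      # about 80 new 502 responses over the last 15 minutes, above the threshold of 50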
- alert: EndpointDown
expr: probe_success == 0
for: "2m"
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }} is down"