Skip to content
Snippets Groups Projects
Verified Commit 62282ca9 authored by Quentin Duchemin's avatar Quentin Duchemin
Browse files

Add dashboard links in alerts annotations and better format floats

parent c61b41c3
No related branches found
No related tags found
1 merge request: !65 Add alerting
......@@ -2,14 +2,14 @@ groups:
# The name of the group. Must be unique within a file.
- name: disk
rules:
- record: instance:node_fs:avail_size
expr: (node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100
- record: instance:node_fs:space_used
expr: 100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100)
- record: instance:node_disk:temperature
expr: avg(smartmon_temperature_celsius_raw_value) by (instance, disk)
# The name of the alert. Must be a valid metric name.
- alert: DiskFull
# The PromQL expression to evaluate
expr: instance:node_fs:avail_size < 20
expr: instance:node_fs:space_used > 80
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
# If param is omitted or set to 0 then alerts will be immediately considered
......@@ -19,44 +19,49 @@ groups:
severity: warning
annotations:
summary: Disk 80% full on {{ $labels.instance }}
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: DiskFull
expr: instance:node_fs:avail_size < 5
expr: instance:node_fs:space_used > 95
for: "15m"
labels:
severity: critical
annotations:
summary: Disk 95% full on {{ $labels.instance }}
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: VMBackupFull
# Backup storage is always called "save" at Picasoft.
# pve_storage_info is always 1, so multiplying by it leaves the ratio unchanged; the join on `id`
# only copies the `storage` label (present in pve_storage_info, missing from pve_disk_*) into the result.
expr: (pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
expr: ((pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
for: "6h"
labels:
severity: warning
annotations:
summary: Proxmox backup volume 90% full
description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
- alert: VMStorageSSDFull
# SSD storage is always called "local" at Picasoft.
expr: (pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
expr: ((pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
for: "15m"
labels:
severity: critical
annotations:
summary: Proxmox SSD volume 90% full
description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
- alert: VMStorageHDDFull
# HDD storage always has "hdd" in its name at Picasoft.
expr: (pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
expr: ((pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
for: "15m"
labels:
severity: critical
annotations:
summary: Proxmox HDD volume 90% full
description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
- alert: DiskDamaged
# Only take values from physical disks, i.e. ignore VMs.
# This is hardcoded, but I cannot see any other way to do it because VMs do not have a specific prefix.
......@@ -67,6 +72,7 @@ groups:
annotations:
# Human-readable text and dashboard link attached to the alert.
summary: Physical disk unhealthy
description: Disk {{ $labels.disk }} on machine {{ $labels.instance }} is marked unhealthy in S.M.A.R.T. values
dashboard: https://grafana.picasoft.net/d/PkPI4xGWz/s-m-a-r-t-info?var-node={{ $labels.instance }}
- alert: RaidDegraded
expr: (node_md_disks - node_md_disks_active) != 0
labels:
......@@ -74,6 +80,7 @@ groups:
annotations:
# Text shown when the total and active disk counts of an md array differ.
summary: RAID on node {{ $labels.instance }} is in degraded mode
description: "Degraded RAID array {{ $labels.device }} on {{ $labels.instance }} : {{ $value }} disks failed"
dashboard: https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }}
- alert: DiskHighTemperature
expr: instance:node_disk:temperature > 50
for: "5m"
......@@ -81,7 +88,8 @@ groups:
severity: warning
annotations:
summary: Disk temperature > 50°C
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- alert: DiskHighTemperature
expr: instance:node_disk:temperature > 60
for: "5m"
......@@ -89,7 +97,8 @@ groups:
severity: critical
annotations:
summary: Disk temperature > 60°C
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- name: cpu
rules:
......@@ -122,7 +131,8 @@ groups:
severity: warning
annotations:
summary: CPU usage over 90%
description: CPU use percent is {{ $value }}% on {{ $labels.instance }} for the past 30 minutes
description: CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: HighCPUTemperature
expr: instance:node_cpu_temperature > 60
for: "5m"
......@@ -130,7 +140,8 @@ groups:
severity: warning
annotations:
summary: CPU temperature over 60°C
description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: HighCPUTemperature
expr: instance:node_cpu_temperature > 80
for: "5m"
......@@ -138,7 +149,8 @@ groups:
# Escalate above the 60°C rule of the same name, which already uses `warning`;
# with the same severity on both thresholds the 80°C rule adds no escalation.
severity: critical
annotations:
summary: CPU temperature over 80°C
description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- name: ram
rules:
......@@ -151,7 +163,8 @@ groups:
severity: warning
annotations:
summary: More than 80% of RAM is used
description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- alert: HighRAMUse
expr: instance:node_memory < 5
for: "10h"
......@@ -159,7 +172,8 @@ groups:
severity: critical
annotations:
summary: More than 95% of RAM is used
description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- name: network
rules:
......@@ -171,6 +185,7 @@ groups:
annotations:
summary: Network interface is reporting many receive errors
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
- alert: SendHighErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: "5m"
......@@ -179,6 +194,7 @@ groups:
annotations:
summary: Network interface is reporting many transmit errors
# $value is the errors/packets ratio from the rule expression (fires above 0.01), not an error count,
# so print it with decimals; "%.0f" would always round it to 0 or 1.
description: '{{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf "%.3f" $value }} (errors per packet) over the last two minutes.'
dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
- alert: ReceiveHighDrop
expr: rate(node_network_receive_drop_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: "5m"
......@@ -187,6 +203,7 @@ groups:
annotations:
summary: Network interface is reporting many receive drops
# $value is the drops/packets ratio from the rule expression (fires above 0.01), not a drop count,
# so print it with decimals; "%.0f" would always round it to 0 or 1.
description: '{{ $labels.instance }} interface {{ $labels.device }} has a receive drop ratio of {{ printf "%.3f" $value }} (drops per packet) over the last two minutes.'
dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
- alert: SendHighDrop
expr: rate(node_network_transmit_drop_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: "5m"
......@@ -195,6 +212,7 @@ groups:
annotations:
summary: Network interface is reporting many transmit drops
# $value is the drops/packets ratio from the rule expression (fires above 0.01), not a drop count,
# so print it with decimals; "%.0f" would always round it to 0 or 1.
description: '{{ $labels.instance }} interface {{ $labels.device }} has a transmit drop ratio of {{ printf "%.3f" $value }} (drops per packet) over the last two minutes.'
dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
- name: services
rules:
......@@ -206,6 +224,7 @@ groups:
annotations:
# Text and Traefik dashboard link (filtered by node and service) attached to the alert.
summary: Lot of 4XX errors
description: Service {{ $labels.service_name }} running on {{ $labels.instance }} is encountering a lot of {{ $labels.code }} errors.
dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
- alert: 500Errors
expr: increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) > 50
for: "15m"
......@@ -214,6 +233,7 @@ groups:
annotations:
# Text and Traefik dashboard link (filtered by node and service) attached to the alert.
summary: Lot of 5XX errors
description: Service {{ $labels.service_name }} running on {{ $labels.instance }} is encountering a lot of {{ $labels.code }} errors.
dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
- alert: EndpointDown
expr: probe_success == 0
for: "2m"
......@@ -222,6 +242,8 @@ groups:
annotations:
summary: "Service down"
description: "{{ $labels.instance }} is down for more than 2 minutes"
# Redirect to HTTP or DNS dashboard based on vmagent job name
dashboard: '{{ if eq $labels.job "blackbox-http" -}}https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{ $labels.instance }}{{- else if eq $labels.job "blackbox-dns" -}}https://grafana.picasoft.net/d/1twteMV7k/serveurs-dns?var-instance={{ $labels.instance }}{{- end -}}'
- alert: PostfixRejection
# As a trial, alert on any rejection. If this produces too many alerts, we will raise the threshold.
expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 0
......@@ -231,3 +253,4 @@ groups:
annotations:
# Text and Postfix dashboard link attached to the alert.
summary: Mail rejection
description: At least one mail sent from {{ $labels.instance }} has been rejected
dashboard: https://grafana.picasoft.net/d/VB5CUrn7k/postfix?var-instance={{ $labels.instance }}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment