Verified Commit 62282ca9 authored by Quentin Duchemin
Browse files

Add dashboard links in alerts annotations and better format floats

parent c61b41c3
......@@ -2,14 +2,14 @@ groups:
# The name of the group. Must be unique within a file.
- name: disk
rules:
- record: instance:node_fs:avail_size
expr: (node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100
- record: instance:node_fs:space_used
expr: 100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100)
- record: instance:node_disk:temperature
expr: avg(smartmon_temperature_celsius_raw_value) by (instance, disk)
# The name of the alert. Must be a valid metric name.
- alert: DiskFull
# The PromQL expression to evaluate
expr: instance:node_fs:avail_size < 20
expr: instance:node_fs:space_used > 80
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
# If param is omitted or set to 0 then alerts will be immediately considered
......@@ -19,44 +19,49 @@ groups:
severity: warning
annotations:
summary: Disk 80% full on {{ $labels.instance }}
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: DiskFull
expr: instance:node_fs:avail_size < 5
expr: instance:node_fs:space_used > 95
for: "15m"
labels:
severity: critical
annotations:
summary: Disk 95% full on {{ $labels.instance }}
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: VMBackupFull
# Backup storage is always called "save" at Picasoft.
# pve_storage_info is always 1, so multiplying by it (joined on `id`) adds the `storage` label to the resulting vector;
# that label is missing from pve_disk_* but present in pve_storage_info.
expr: (pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
expr: ((pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
for: "6h"
labels:
severity: warning
annotations:
summary: Proxmox backup volume 90% full
description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
- alert: VMStorageSSDFull
# SSD storage is always called "local" at Picasoft.
expr: (pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
expr: ((pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
for: "15m"
labels:
severity: critical
annotations:
summary: Proxmox SSD volume 90% full
description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
- alert: VMStorageHDDFull
# HDD storage always has "hdd" in its name at Picasoft.
expr: (pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
expr: ((pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
for: "15m"
labels:
severity: critical
annotations:
summary: Proxmox HDD volume 90% full
description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
- alert: DiskDamaged
# Only get values from real disks so ignore VMs
# This is hardcoded, but I cannot see another way to do it because VMs do not have a specific prefix
......@@ -67,6 +72,7 @@ groups:
annotations:
summary: Physical disk unhealthy
description: Disk {{ $labels.disk }} on machine {{ $labels.instance }} in marked unhealthy in S.M.A.R.T values
dashboard: https://grafana.picasoft.net/d/PkPI4xGWz/s-m-a-r-t-info?var-node={{ $labels.instance }}
- alert: RaidDegraded
expr: (node_md_disks - node_md_disks_active) != 0
labels:
......@@ -74,6 +80,7 @@ groups:
annotations:
summary: RAID on node {{ $labels.instance }} is in degrade mode
description: "Degraded RAID array {{ $labels.device }} on {{ $labels.instance }} : {{ $value }} disks failed"
dashboard: https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }}
- alert: DiskHighTemperature
expr: instance:node_disk:temperature > 50
for: "5m"
......@@ -81,7 +88,8 @@ groups:
severity: warning
annotations:
summary: Disk temperature > 50°C
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- alert: DiskHighTemperature
expr: instance:node_disk:temperature > 60
for: "5m"
......@@ -89,7 +97,8 @@ groups:
severity: critical
annotations:
summary: Disk temperature > 60°C
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- name: cpu
rules:
......@@ -122,7 +131,8 @@ groups:
severity: warning
annotations:
summary: CPU usage over 90%
description: CPU use percent is {{ $value }}% on {{ $labels.instance }} for the past 30 minutes
description: CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: HighCPUTemperature
expr: instance:node_cpu_temperature > 60
for: "5m"
......@@ -130,7 +140,8 @@ groups:
severity: warning
annotations:
summary: CPU temperature over 60°C
description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
- alert: HighCPUTemperature
expr: instance:node_cpu_temperature > 80
for: "5m"
......@@ -138,7 +149,8 @@ groups:
severity: warning
annotations:
summary: CPU temperature over 80°C
description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- name: ram
rules:
......@@ -151,7 +163,8 @@ groups:
severity: warning
annotations:
summary: More than 80% of RAM is used
description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- alert: HighRAMUse
expr: instance:node_memory < 5
for: "10h"
......@@ -159,7 +172,8 @@ groups:
severity: critical
annotations:
summary: More than 95% of RAM is used
description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
- name: network
rules:
......@@ -171,6 +185,7 @@ groups:
annotations:
summary: Network interface is reporting many receive errors
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
- alert: SendHighErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: "5m"
......@@ -179,6 +194,7 @@ groups:
annotations:
summary: Network interface is reporting many transmit errors
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
- alert: ReceiveHighDrop
expr: rate(node_network_receive_drop_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: "5m"
......@@ -187,6 +203,7 @@ groups:
annotations:
summary: Network interface is reporting many receive drops
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive drops in the last two minutes.'
dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
- alert: SendHighDrop
expr: rate(node_network_transmit_drop_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: "5m"
......@@ -195,6 +212,7 @@ groups:
annotations:
summary: Network interface is reporting many transmit drops
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit drops in the last two minutes.'
dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
- name: services
rules:
......@@ -206,6 +224,7 @@ groups:
annotations:
summary: Lot of 4XX errors
description: Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
- alert: 500Errors
expr: increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) > 50
for: "15m"
......@@ -214,6 +233,7 @@ groups:
annotations:
summary: Lot of 5XX errors
description: Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
- alert: EndpointDown
expr: probe_success == 0
for: "2m"
......@@ -222,6 +242,8 @@ groups:
annotations:
summary: "Service down"
description: "{{ $labels.instance }} is down for more than 2 minutes"
# Redirect to HTTP or DNS dashboard based on vmagent job name
dashboard: '{{ if eq $labels.job "blackbox-http" -}}https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{ $labels.instance }}{{- else if eq $labels.job "blackbox-dns" -}}https://grafana.picasoft.net/d/1twteMV7k/serveurs-dns?var-instance={{ $labels.instance }}{{- end -}}'
- alert: PostfixRejection
# As a test, alert on any rejection. If this produces too many alerts, we will raise the threshold.
expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 0
......@@ -231,3 +253,4 @@ groups:
annotations:
summary: Mail rejection
description: At least one mail sent from {{ $labels.instance }} have been rejected
dashboard: https://grafana.picasoft.net/d/VB5CUrn7k/postfix?var-instance={{ $labels.instance }}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment