diff --git a/pica-metrologie/vmalert-rules.yml b/pica-metrologie/vmalert-rules.yml
index 8b4575359cde8d3818e9bdf6c9836ea2c9221978..fd9b40bc9257981c782e35a1747904c24dd90d58 100644
--- a/pica-metrologie/vmalert-rules.yml
+++ b/pica-metrologie/vmalert-rules.yml
@@ -2,14 +2,14 @@ groups:
   # The name of the group. Must be unique within a file.
   - name: disk
     rules:
-      - record: instance:node_fs:avail_size
-        expr: (node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100
+      - record: instance:node_fs:space_used
+        expr: 100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100)
       - record: instance:node_disk:temperature
         expr: avg(smartmon_temperature_celsius_raw_value) by (instance, disk)
       # The name of the alert. Must be a valid metric name.
       - alert: DiskFull
         # The PromQL expression to evaluate
-        expr: instance:node_fs:avail_size < 20
+        expr: instance:node_fs:space_used > 80
         # Alerts are considered firing once they have been returned for this long.
         # Alerts which have not yet fired for long enough are considered pending.
         # If param is omitted or set to 0 then alerts will be immediately considered
@@ -19,44 +19,49 @@ groups:
           severity: warning
         annotations:
           summary: Disk 80% full on {{ $labels.instance }}
-          description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
+          description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
+          dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
       - alert: DiskFull
-        expr: instance:node_fs:avail_size < 5
+        expr: instance:node_fs:space_used > 95
         for: "15m"
         labels:
           severity: critical
         annotations:
           summary: Disk 95% full on {{ $labels.instance }}
-          description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
+          description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
+          dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
       - alert: VMBackupFull
         # Backup storage is always called "save" at Picasoft.
         # pve_storage_info is always 1, and the multiplication allow to get `storage` label in the resulting vector
         # missing from pve_disk_* but present in pve_storage_info with join on `id`
-        expr: (pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
+        expr: ((pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
         for: "6h"
         labels:
           severity: warning
         annotations:
           summary: Proxmox backup volume 90% full
-          description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
+          description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
+          dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
       - alert: VMStorageSSDFull
         # SSD storage is always called "local" at Picasoft.
-        expr: (pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
+        expr: ((pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
         for: "15m"
         labels:
           severity: critical
         annotations:
           summary: Proxmox SSD volume 90% full
-          description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
+          description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
+          dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
       - alert: VMStorageHDDFull
         # HDD storage always has "hdd" in its name at Picasoft.
-        expr: (pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
+        expr: ((pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
         for: "15m"
         labels:
           severity: critical
         annotations:
           summary: Proxmox HDD volume 90% full
-          description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
+          description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
+          dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
       - alert: DiskDamaged
         # Only get values from real disks so ignore VMs
         # This is hardcoded but I cannot see other way to do so because VMs do no have a specific prefix
@@ -67,6 +72,7 @@ groups:
         annotations:
           summary: Physical disk unhealthy
           description: Disk {{ $labels.disk }} on machine {{ $labels.instance }} in marked unhealthy in S.M.A.R.T values
+          dashboard: https://grafana.picasoft.net/d/PkPI4xGWz/s-m-a-r-t-info?var-node={{ $labels.instance }}
       - alert: RaidDegraded
         expr: (node_md_disks - node_md_disks_active) != 0
         labels:
@@ -74,6 +80,7 @@ groups:
         annotations:
           summary: RAID on node {{ $labels.instance }} is in degrade mode
           description: "Degraded RAID array {{ $labels.device }} on {{ $labels.instance }} : {{ $value }} disks failed"
+          dashboard: https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }}
       - alert: DiskHighTemperature
         expr: instance:node_disk:temperature > 50
         for: "5m"
@@ -81,7 +88,8 @@ groups:
           severity: warning
         annotations:
           summary: Disk temperature > 50°C
-          description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
+          description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
+          dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
       - alert: DiskHighTemperature
         expr: instance:node_disk:temperature > 60
         for: "5m"
@@ -89,7 +97,8 @@ groups:
           severity: critical
         annotations:
           summary: Disk temperature > 60°C
-          description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
+          description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
+          dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
 
   - name: cpu
     rules:
@@ -122,7 +131,8 @@ groups:
           severity: warning
         annotations:
           summary: CPU usage over 90%
-          description: CPU use percent is {{ $value }}% on {{ $labels.instance }} for the past 30 minutes
+          description: CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes
+          dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
       - alert: HighCPUTemperature
         expr: instance:node_cpu_temperature > 60
         for: "5m"
@@ -130,7 +140,8 @@ groups:
           severity: warning
         annotations:
           summary: CPU temperature over 60°C
-          description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
+          description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
+          dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
       - alert: HighCPUTemperature
         expr: instance:node_cpu_temperature > 80
         for: "5m"
@@ -138,7 +149,8 @@ groups:
           severity: warning
         annotations:
           summary: CPU temperature over 80°C
-          description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
+          description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
+          dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
 
   - name: ram
     rules:
@@ -151,7 +163,8 @@ groups:
           severity: warning
         annotations:
           summary: More than 80% of RAM is used
-          description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
+          description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
+          dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
       - alert: HighRAMUse
         expr: instance:node_memory < 5
         for: "10h"
@@ -159,7 +172,8 @@ groups:
           severity: critical
         annotations:
           summary: More than 95% of RAM is used
-          description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
+          description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
+          dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
 
   - name: network
     rules:
@@ -171,6 +185,7 @@ groups:
         annotations:
           summary: Network interface is reporting many receive errors
           description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+          dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
       - alert: SendHighErrors
         expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
         for: "5m"
@@ -179,6 +194,7 @@ groups:
         annotations:
           summary: Network interface is reporting many transmit errors
           description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+          dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
       - alert: ReceiveHighDrop
         expr: rate(node_network_receive_drop_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
         for: "5m"
@@ -187,6 +203,7 @@ groups:
         annotations:
           summary: Network interface is reporting many receive drops
           description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive drops in the last two minutes.'
+          dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
       - alert: SendHighDrop
         expr: rate(node_network_transmit_drop_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
         for: "5m"
@@ -195,6 +212,7 @@ groups:
         annotations:
           summary: Network interface is reporting many transmit drops
           description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit drops in the last two minutes.'
+          dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
 
   - name: services
     rules:
@@ -206,6 +224,7 @@ groups:
         annotations:
           summary: Lot of 4XX errors
           description: Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
+          dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
       - alert: 500Errors
         expr: increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) > 50
         for: "15m"
@@ -214,6 +233,7 @@ groups:
         annotations:
           summary: Lot of 5XX errors
           description: Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
+          dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
       - alert: EndpointDown
         expr: probe_success == 0
         for: "2m"
@@ -222,6 +242,8 @@ groups:
         annotations:
           summary: "Service down"
           description: "{{ $labels.instance }} is down for more than 2 minutes"
+          # Redirect to HTTP or DNS dashboard based on vmagent job name
+          dashboard: '{{ if eq $labels.job "blackbox-http" -}}https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{ $labels.instance }}{{- else if eq $labels.job "blackbox-dns" -}}https://grafana.picasoft.net/d/1twteMV7k/serveurs-dns?var-instance={{ $labels.instance }}{{- end -}}'
       - alert: PostfixRejection
         # As a test, alert when there is any rejection. If too much alert we will raise the threshold
         expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 0
@@ -231,3 +253,4 @@ groups:
         annotations:
           summary: Mail rejection
           description: At least one mail sent from {{ $labels.instance }} have been rejected
+          dashboard: https://grafana.picasoft.net/d/VB5CUrn7k/postfix?var-instance={{ $labels.instance }}
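The three Proxmox storage alerts above repeat the same label-join expression; as the inline comments note, pve_storage_info is always 1 and is multiplied in only to copy its `storage` label (missing from pve_disk_*) onto the result, matching on `id`. That pattern could be factored into a recording rule, in the same spirit as instance:node_fs:space_used. A minimal sketch, not part of the patch: the group name and the rule name instance:pve_disk:space_used are hypothetical, while the metric names, the join, and the 90% threshold come from the rules above.

groups:
  # Hypothetical group, for illustration only.
  - name: proxmox
    rules:
      # pve_storage_info is always 1; the multiplication only joins its `storage`
      # label onto the pve_disk_* series, matching on `id`.
      - record: instance:pve_disk:space_used
        expr: ((pve_disk_usage_bytes{id=~"storage/.+"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100)
      # An alert such as VMBackupFull could then filter the recorded series by `id`:
      - alert: VMBackupFull
        expr: instance:pve_disk:space_used{id=~"storage/.+/save"} > 90
        for: "6h"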