Add dashboard links to alert annotations and improve float formatting

62282ca9 · Quentin Duchemin · c61b41c3 · 62282ca9
Verified Commit 62282ca9 authored 3 years ago by Quentin Duchemin
--- a/pica-metrologie/vmalert-rules.yml
+++ b/pica-metrologie/vmalert-rules.yml
@@ -2,14 +2,14 @@ groups:
  # The name of the group. Must be unique within a file.
 - name: disk
  rules:
-  - record: instance:node_fs:avail_size
-    expr: (node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100
+  - record: instance:node_fs:space_used
+    expr: 100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100)
  - record: instance:node_disk:temperature
    expr: avg(smartmon_temperature_celsius_raw_value) by (instance, disk)
    # The name of the alert. Must be a valid metric name.
  - alert: DiskFull
    # The PromQL expression to evaluate
-    expr: instance:node_fs:avail_size < 20
+    expr: instance:node_fs:space_used > 80
    # Alerts are considered firing once they have been returned for this long.
    # Alerts which have not yet fired for long enough are considered pending.
    # If param is omitted or set to 0 then alerts will be immediately considered
@@ -19,44 +19,49 @@ groups:
      severity: warning
    annotations:
      summary: Disk 80% full on {{ $labels.instance }}
-      description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
+      description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
+      dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
  - alert: DiskFull
-    expr: instance:node_fs:avail_size < 5
+    expr: instance:node_fs:space_used > 95
    for: "15m"
    labels:
      severity: critical
    annotations:
      summary: Disk 95% full on {{ $labels.instance }}
-      description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
+      description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
+      dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
  - alert: VMBackupFull
    # Backup storage is always called "save" at Picasoft.
    # pve_storage_info is always 1, and the multiplication allow to get `storage` label in the resulting vector
    # missing from pve_disk_* but present in pve_storage_info with join on `id`
-    expr: (pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
+    expr: ((pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
    for: "6h"
    labels:
      severity: warning
    annotations:
      summary: Proxmox backup volume 90% full
-      description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
+      description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
+      dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
  - alert: VMStorageSSDFull
    # SSD storage is always called "local" at Picasoft.
-    expr: (pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
+    expr: ((pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
    for: "15m"
    labels:
      severity: critical
    annotations:
      summary: Proxmox SSD volume 90% full
-      description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
+      description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
+      dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
  - alert: VMStorageHDDFull
    # HDD storage always has "hdd" in its name at Picasoft.
-    expr: (pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 > 90
+    expr: ((pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90
    for: "15m"
    labels:
      severity: critical
    annotations:
      summary: Proxmox HDD volume 90% full
-      description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
+      description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
+      dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
  - alert: DiskDamaged
    # Only get values from real disks so ignore VMs
    # This is hardcoded but I cannot see other way to do so because VMs do no have a specific prefix
@@ -67,6 +72,7 @@ groups:
    annotations:
      summary: Physical disk unhealthy
      description: Disk {{ $labels.disk }} on machine {{ $labels.instance }} in marked unhealthy in S.M.A.R.T values
+      dashboard: https://grafana.picasoft.net/d/PkPI4xGWz/s-m-a-r-t-info?var-node={{ $labels.instance }}
  - alert: RaidDegraded
    expr: (node_md_disks - node_md_disks_active) != 0
    labels:
@@ -74,6 +80,7 @@ groups:
    annotations:
      summary: RAID on node {{ $labels.instance }} is in degrade mode
      description: "Degraded RAID array {{ $labels.device }} on {{ $labels.instance }} : {{ $value }} disks failed"
+      dashboard: https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }}
  - alert: DiskHighTemperature
    expr: instance:node_disk:temperature > 50
    for: "5m"
@@ -81,7 +88,8 @@ groups:
      severity: warning
    annotations:
      summary: Disk temperature > 50°C
-      description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
+      description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
+      dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
  - alert: DiskHighTemperature
    expr: instance:node_disk:temperature > 60
    for: "5m"
@@ -89,7 +97,8 @@ groups:
      severity: critical
    annotations:
      summary: Disk temperature > 60°C
-      description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
+      description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
+      dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}

 - name: cpu
  rules:
@@ -122,7 +131,8 @@ groups:
      severity: warning
    annotations:
      summary: CPU usage over 90%
-      description: CPU use percent is {{ $value }}% on {{ $labels.instance }} for the past 30 minutes
+      description: CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes
+      dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
  - alert: HighCPUTemperature
    expr: instance:node_cpu_temperature > 60
    for: "5m"
@@ -130,7 +140,8 @@ groups:
      severity: warning
    annotations:
      summary: CPU temperature over 60°C
-      description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
+      description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
+      dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
  - alert: HighCPUTemperature
    expr: instance:node_cpu_temperature > 80
    for: "5m"
@@ -138,7 +149,8 @@ groups:
      severity: warning
    annotations:
      summary: CPU temperature over 80°C
-      description: CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
+      description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
+      dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}

 - name: ram
  rules:
@@ -151,7 +163,8 @@ groups:
      severity: warning
    annotations:
      summary: More than 80% of RAM is used
-      description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
+      description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
+      dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
  - alert: HighRAMUse
    expr: instance:node_memory < 5
    for: "10h"
@@ -159,7 +172,8 @@ groups:
      severity: critical
    annotations:
      summary: More than 95% of RAM is used
-      description: Available RAM on {{ $labels.instance }} is {{ $value }}%.
+      description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
+      dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}

 - name: network
  rules:
@@ -171,6 +185,7 @@ groups:
    annotations:
      summary: Network interface is reporting many receive errors
      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+      dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
  - alert: SendHighErrors
    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: "5m"
@@ -179,6 +194,7 @@ groups:
    annotations:
      summary: Network interface is reporting many transmit errors
      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+      dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
  - alert: ReceiveHighDrop
    expr: rate(node_network_receive_drop_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: "5m"
@@ -187,6 +203,7 @@ groups:
    annotations:
      summary: Network interface is reporting many receive drops
      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive drops in the last two minutes.'
+      dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
  - alert: SendHighDrop
    expr: rate(node_network_transmit_drop_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: "5m"
@@ -195,6 +212,7 @@ groups:
    annotations:
      summary: Network interface is reporting many transmit drops
      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit drops in the last two minutes.'
+      dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}

 - name: services
  rules:
@@ -206,6 +224,7 @@ groups:
    annotations:
      summary: Lot of 4XX errors
      description: Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
+      dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
  - alert: 500Errors
    expr: increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) > 50
    for: "15m"
@@ -214,6 +233,7 @@ groups:
    annotations:
      summary: Lot of 5XX errors
      description: Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
+      dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
  - alert: EndpointDown
    expr: probe_success == 0
    for: "2m"
@@ -222,6 +242,8 @@ groups:
    annotations:
      summary: "Service down"
      description: "{{ $labels.instance }} is down for more than 2 minutes"
+      # Redirect to HTTP or DNS dashboard based on vmagent job name
+      dashboard: '{{ if eq $labels.job "blackbox-http" -}}https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{ $labels.instance }}{{- else if eq $labels.job "blackbox-dns" -}}https://grafana.picasoft.net/d/1twteMV7k/serveurs-dns?var-instance={{ $labels.instance }}{{- end -}}'
  - alert: PostfixRejection
    # As a test, alert when there is any rejection. If too much alert we will raise the threshold
    expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 0
@@ -231,3 +253,4 @@ groups:
    annotations:
      summary: Mail rejection
      description: At least one mail sent from {{ $labels.instance }} have been rejected
+      dashboard: https://grafana.picasoft.net/d/VB5CUrn7k/postfix?var-instance={{ $labels.instance }}