diff --git a/pica-metrologie/vmalert-rules.yml b/pica-metrologie/vmalert-rules.yml index edd3aea58c0990c031724908eb9a6011a890f55a..5bf8331a71a403df12abdb23fcd8998fa25f7b8b 100644 --- a/pica-metrologie/vmalert-rules.yml +++ b/pica-metrologie/vmalert-rules.yml @@ -2,32 +2,14 @@ groups: # The name of the group. Must be unique within a file. - name: disk rules: - - record: instance:node_fs:space_used - expr: 100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100) - - record: instance:node_disk:temperature - expr: avg(smartmon_temperature_celsius_raw_value) by (instance, disk) # The name of the alert. Must be a valid metric name. - alert: DiskFull - # The PromQL expression to evaluate - expr: instance:node_fs:space_used > 80 - # Alerts are considered firing once they have been returned for this long. - # Alerts which have not yet fired for long enough are considered pending. - # If param is omitted or set to 0 then alerts will be immediately considered - # as firing once they return. + expr: (100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100)) > 90 for: "10m" labels: severity: warning annotations: - summary: Disk 80% full on {{ $labels.instance }} - description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full - dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }} - - alert: DiskFull - expr: instance:node_fs:space_used > 95 - for: "10m" - labels: - severity: critical - annotations: - summary: Disk 95% full on {{ $labels.instance }} + summary: Disk 90% full on {{ $labels.instance }} description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }} - alert: VMBackupFull @@ -84,16 +66,7 @@ groups: description: "Degraded RAID array {{ $labels.device }} on {{ $labels.instance }} : {{ $value }} disks failed" dashboard: https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }} - alert: DiskHighTemperature - expr: instance:node_disk:temperature > 50 - for: "5m" - labels: - severity: warning - annotations: - summary: Disk temperature > 50°C - description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes - dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }} - - alert: DiskHighTemperature - expr: instance:node_disk:temperature > 60 + expr: (avg(smartmon_temperature_celsius_raw_value) by (instance, disk)) > 60 for: "5m" labels: severity: critical @@ -135,15 +108,6 @@ groups: summary: CPU usage over 90% description: CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }} - - alert: HighCPUTemperature - expr: instance:node_cpu_temperature > 60 - for: "5m" - labels: - severity: warning - annotations: - summary: CPU temperature over 60°C - description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }} - dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }} - alert: HighCPUTemperature expr: instance:node_cpu_temperature > 80 for: "5m" @@ -156,24 +120,13 @@ groups: - name: ram rules: - - record: instance:node_memory - expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 - alert: HighRAMUse - expr: instance:node_memory < 20 + expr: ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100) < 10 for: "10m" labels: severity: warning annotations: - summary: More than 80% of RAM is used - description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%. - dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }} - - alert: HighRAMUse - expr: instance:node_memory < 5 - for: "10m" - labels: - severity: critical - annotations: - summary: More than 95% of RAM is used + summary: More than 90% of RAM is used description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%. dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}