Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Picasoft
Technique
Dockerfiles
Commits
62282ca9
Verified
Commit
62282ca9
authored
Aug 29, 2021
by
Quentin Duchemin
Browse files
Add dashboard links in alerts annotations and better format floats
parent
c61b41c3
Changes
1
Hide whitespace changes
Inline
Side-by-side
pica-metrologie/vmalert-rules.yml
View file @
62282ca9
...
...
@@ -2,14 +2,14 @@ groups:
# The name of the group. Must be unique within a file.
-
name
:
disk
rules
:
-
record
:
instance:node_fs:
avail_size
expr
:
(node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) *
100
-
record
:
instance:node_fs:
space_used
expr
:
100 - (
(node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100
)
-
record
:
instance:node_disk:temperature
expr
:
avg(smartmon_temperature_celsius_raw_value) by (instance, disk)
# The name of the alert. Must be a valid metric name.
-
alert
:
DiskFull
# The PromQL expression to evaluate
expr
:
instance:node_fs:
avail_size <
2
0
expr
:
instance:node_fs:
space_used >
8
0
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
# If param is omitted or set to 0 then alerts will be immediately considered
...
...
@@ -19,44 +19,49 @@ groups:
severity
:
warning
annotations
:
summary
:
Disk 80% full on {{ $labels.instance }}
description
:
Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
description
:
Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
-
alert
:
DiskFull
expr
:
instance:node_fs:
avail_size <
5
expr
:
instance:node_fs:
space_used >
9
5
for
:
"
15m"
labels
:
severity
:
critical
annotations
:
summary
:
Disk 95% full on {{ $labels.instance }}
description
:
Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
description
:
Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
-
alert
:
VMBackupFull
# Backup storage is always called "save" at Picasoft.
# pve_storage_info is always 1, so multiplying by it leaves the value unchanged while adding the `storage` label to the resulting vector.
# That label is missing from pve_disk_* but present in pve_storage_info; the two are joined on `id`.
expr
:
(pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 >
90
expr
:
(
(pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100
)
>
90
for
:
"
6h"
labels
:
severity
:
warning
annotations
:
summary
:
Proxmox backup volume 90% full
description
:
Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description
:
Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
-
alert
:
VMStorageSSDFull
# SSD storage is always called "local" at Picasoft.
expr
:
(pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 >
90
expr
:
(
(pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100
)
>
90
for
:
"
15m"
labels
:
severity
:
critical
annotations
:
summary
:
Proxmox SSD volume 90% full
description
:
Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description
:
Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
-
alert
:
VMStorageHDDFull
# HDD storage always has "hdd" in its name at Picasoft.
expr
:
(pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 >
90
expr
:
(
(pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100
)
>
90
for
:
"
15m"
labels
:
severity
:
critical
annotations
:
summary
:
Proxmox HDD volume 90% full
description
:
Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description
:
Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
-
alert
:
DiskDamaged
# Only get values from real disks so ignore VMs
# This is hardcoded, but I cannot see any other way to do it because VMs do not have a specific prefix
...
...
@@ -67,6 +72,7 @@ groups:
annotations
:
summary
:
Physical disk unhealthy
description
:
Disk {{ $labels.disk }} on machine {{ $labels.instance }} in marked unhealthy in S.M.A.R.T values
dashboard
:
https://grafana.picasoft.net/d/PkPI4xGWz/s-m-a-r-t-info?var-node={{ $labels.instance }}
-
alert
:
RaidDegraded
expr
:
(node_md_disks - node_md_disks_active) !=
0
labels
:
...
...
@@ -74,6 +80,7 @@ groups:
annotations
:
summary
:
RAID on node {{ $labels.instance }} is in degrade mode
description
:
"
Degraded
RAID
array
{{
$labels.device
}}
on
{{
$labels.instance
}}
:
{{
$value
}}
disks
failed"
dashboard
:
https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }}
-
alert
:
DiskHighTemperature
expr
:
instance:node_disk:temperature >
50
for
:
"
5m"
...
...
@@ -81,7 +88,8 @@ groups:
severity
:
warning
annotations
:
summary
:
Disk temperature > 50°C
description
:
Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
description
:
Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
alert
:
DiskHighTemperature
expr
:
instance:node_disk:temperature >
60
for
:
"
5m"
...
...
@@ -89,7 +97,8 @@ groups:
severity
:
critical
annotations
:
summary
:
Disk temperature > 60°C
description
:
Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
description
:
Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
name
:
cpu
rules
:
...
...
@@ -122,7 +131,8 @@ groups:
severity
:
warning
annotations
:
summary
:
CPU usage over 90%
description
:
CPU use percent is {{ $value }}% on {{ $labels.instance }} for the past 30 minutes
description
:
CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes
dashboard
:
https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
-
alert
:
HighCPUTemperature
expr
:
instance:node_cpu_temperature >
60
for
:
"
5m"
...
...
@@ -130,7 +140,8 @@ groups:
severity
:
warning
annotations
:
summary
:
CPU temperature over 60°C
description
:
CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
description
:
CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard
:
https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
-
alert
:
HighCPUTemperature
expr
:
instance:node_cpu_temperature >
80
for
:
"
5m"
...
...
@@ -138,7 +149,8 @@ groups:
severity
:
warning
annotations
:
summary
:
CPU temperature over 80°C
description
:
CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
description
:
CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
name
:
ram
rules
:
...
...
@@ -151,7 +163,8 @@ groups:
severity
:
warning
annotations
:
summary
:
More than 80% of RAM is used
description
:
Available RAM on {{ $labels.instance }} is {{ $value }}%.
description
:
Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
alert
:
HighRAMUse
expr
:
instance:node_memory <
5
for
:
"
10h"
...
...
@@ -159,7 +172,8 @@ groups:
severity
:
critical
annotations
:
summary
:
More than 95% of RAM is used
description
:
Available RAM on {{ $labels.instance }} is {{ $value }}%.
description
:
Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
name
:
network
rules
:
...
...
@@ -171,6 +185,7 @@ groups:
annotations
:
summary
:
Network interface is reporting many receive errors
description
:
'
{{
$labels.instance
}}
interface
{{
$labels.device
}}
has
encountered
{{
printf
"%.0f"
$value
}}
receive
errors
in
the
last
two
minutes.'
dashboard
:
https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
-
alert
:
SendHighErrors
expr
:
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) >
0.01
for
:
"
5m"
...
...
@@ -179,6 +194,7 @@ groups:
annotations
:
summary
:
Network interface is reporting many transmit errors
description
:
'
{{
$labels.instance
}}
interface
{{
$labels.device
}}
has
encountered
{{
printf
"%.0f"
$value
}}
transmit
errors
in
the
last
two
minutes.'
dashboard
:
https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
-
alert
:
ReceiveHighDrop
expr
:
rate(node_network_receive_drop_total[2m]) / rate(node_network_receive_packets_total[2m]) >
0.01
for
:
"
5m"
...
...
@@ -187,6 +203,7 @@ groups:
annotations
:
summary
:
Network interface is reporting many receive drops
description
:
'
{{
$labels.instance
}}
interface
{{
$labels.device
}}
has
encountered
{{
printf
"%.0f"
$value
}}
receive
drops
in
the
last
two
minutes.'
dashboard
:
https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
-
alert
:
SendHighDrop
expr
:
rate(node_network_transmit_drop_total[2m]) / rate(node_network_transmit_packets_total[2m]) >
0.01
for
:
"
5m"
...
...
@@ -195,6 +212,7 @@ groups:
annotations
:
summary
:
Network interface is reporting many transmit drops
description
:
'
{{
$labels.instance
}}
interface
{{
$labels.device
}}
has
encountered
{{
printf
"%.0f"
$value
}}
transmit
drops
in
the
last
two
minutes.'
dashboard
:
https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
-
name
:
services
rules
:
...
...
@@ -206,6 +224,7 @@ groups:
annotations
:
summary
:
Lot of 4XX errors
description
:
Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
dashboard
:
https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
-
alert
:
500Errors
expr
:
increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) >
50
for
:
"
15m"
...
...
@@ -214,6 +233,7 @@ groups:
annotations
:
summary
:
Lot of 5XX errors
description
:
Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
dashboard
:
https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
-
alert
:
EndpointDown
expr
:
probe_success ==
0
for
:
"
2m"
...
...
@@ -222,6 +242,8 @@ groups:
annotations
:
summary
:
"
Service
down"
description
:
"
{{
$labels.instance
}}
is
down
for
more
than
2
minutes"
# Redirect to HTTP or DNS dashboard based on vmagent job name
dashboard
:
'
{{
if
eq
$labels.job
"blackbox-http"
-}}https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{
$labels.instance
}}{{-
else
if
eq
$labels.job
"blackbox-dns"
-}}https://grafana.picasoft.net/d/1twteMV7k/serveurs-dns?var-instance={{
$labels.instance
}}{{-
end
-}}'
-
alert
:
PostfixRejection
# As a test, alert on any rejection. If this produces too many alerts, we will raise the threshold.
expr
:
rate(postfix_smtpd_messages_rejected_total[5m]) >
0
...
...
@@ -231,3 +253,4 @@ groups:
annotations
:
summary
:
Mail rejection
description
:
At least one mail sent from {{ $labels.instance }} have been rejected
dashboard
:
https://grafana.picasoft.net/d/VB5CUrn7k/postfix?var-instance={{ $labels.instance }}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment