Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
Dockerfiles
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
Picasoft
Technique
Dockerfiles
Commits
62282ca9
Verified
Commit
62282ca9
authored
3 years ago
by
Quentin Duchemin
Browse files
Options
Downloads
Patches
Plain Diff
Add dashboard links in alerts annotations and better format floats
parent
c61b41c3
No related branches found
Branches containing commit
No related tags found
1 merge request
!65
Add alerting
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
pica-metrologie/vmalert-rules.yml
+42
-19
42 additions, 19 deletions
pica-metrologie/vmalert-rules.yml
with
42 additions
and
19 deletions
pica-metrologie/vmalert-rules.yml
+
42
−
19
View file @
62282ca9
...
...
@@ -2,14 +2,14 @@ groups:
# The name of the group. Must be unique within a file.
-
name
:
disk
rules
:
-
record
:
instance:node_fs:
avail_size
expr
:
(node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) *
100
-
record
:
instance:node_fs:
space_used
expr
:
100 - (
(node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100
)
-
record
:
instance:node_disk:temperature
expr
:
avg(smartmon_temperature_celsius_raw_value) by (instance, disk)
# The name of the alert. Must be a valid metric name.
-
alert
:
DiskFull
# The PromQL expression to evaluate
expr
:
instance:node_fs:
avail_size <
2
0
expr
:
instance:node_fs:
space_used >
8
0
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
# If param is omitted or set to 0 then alerts will be immediately considered
...
...
@@ -19,44 +19,49 @@ groups:
severity
:
warning
annotations
:
summary
:
Disk 80% full on {{ $labels.instance }}
description
:
Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
description
:
Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
-
alert
:
DiskFull
expr
:
instance:node_fs:
avail_size <
5
expr
:
instance:node_fs:
space_used >
9
5
for
:
"
15m"
labels
:
severity
:
critical
annotations
:
summary
:
Disk 95% full on {{ $labels.instance }}
description
:
Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
description
:
Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
-
alert
:
VMBackupFull
# Backup storage is always called "save" at Picasoft.
# pve_storage_info is always 1, so multiplying by it adds the `storage` label to the resulting vector;
# that label is missing from pve_disk_* but present in pve_storage_info, which is joined on `id`.
expr
:
(pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 >
90
expr
:
(
(pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100
)
>
90
for
:
"
6h"
labels
:
severity
:
warning
annotations
:
summary
:
Proxmox backup volume 90% full
description
:
Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description
:
Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
-
alert
:
VMStorageSSDFull
# SSD storage is always called "local" at Picasoft.
expr
:
(pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 >
90
expr
:
(
(pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100
)
>
90
for
:
"
15m"
labels
:
severity
:
critical
annotations
:
summary
:
Proxmox SSD volume 90% full
description
:
Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description
:
Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
-
alert
:
VMStorageHDDFull
# HDD storage always has "hdd" in its name at Picasoft.
expr
:
(pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100 >
90
expr
:
(
(pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100
)
>
90
for
:
"
15m"
labels
:
severity
:
critical
annotations
:
summary
:
Proxmox HDD volume 90% full
description
:
Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ $value }}% full
description
:
Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full
dashboard
:
https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }}
-
alert
:
DiskDamaged
# Only get values from real disks so ignore VMs
# This is hardcoded, but I cannot see another way to do it because VMs do not have a specific prefix
...
...
@@ -67,6 +72,7 @@ groups:
annotations
:
summary
:
Physical disk unhealthy
description
:
Disk {{ $labels.disk }} on machine {{ $labels.instance }} in marked unhealthy in S.M.A.R.T values
dashboard
:
https://grafana.picasoft.net/d/PkPI4xGWz/s-m-a-r-t-info?var-node={{ $labels.instance }}
-
alert
:
RaidDegraded
expr
:
(node_md_disks - node_md_disks_active) !=
0
labels
:
...
...
@@ -74,6 +80,7 @@ groups:
annotations
:
summary
:
RAID on node {{ $labels.instance }} is in degrade mode
description
:
"
Degraded
RAID
array
{{
$labels.device
}}
on
{{
$labels.instance
}}
:
{{
$value
}}
disks
failed"
dashboard
:
https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }}
-
alert
:
DiskHighTemperature
expr
:
instance:node_disk:temperature >
50
for
:
"
5m"
...
...
@@ -81,7 +88,8 @@ groups:
severity
:
warning
annotations
:
summary
:
Disk temperature > 50°C
description
:
Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
description
:
Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
alert
:
DiskHighTemperature
expr
:
instance:node_disk:temperature >
60
for
:
"
5m"
...
...
@@ -89,7 +97,8 @@ groups:
severity
:
critical
annotations
:
summary
:
Disk temperature > 60°C
description
:
Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ $value }}°C for more than 5 minutes
description
:
Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
name
:
cpu
rules
:
...
...
@@ -122,7 +131,8 @@ groups:
severity
:
warning
annotations
:
summary
:
CPU usage over 90%
description
:
CPU use percent is {{ $value }}% on {{ $labels.instance }} for the past 30 minutes
description
:
CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes
dashboard
:
https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
-
alert
:
HighCPUTemperature
expr
:
instance:node_cpu_temperature >
60
for
:
"
5m"
...
...
@@ -130,7 +140,8 @@ groups:
severity
:
warning
annotations
:
summary
:
CPU temperature over 60°C
description
:
CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
description
:
CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard
:
https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }}
-
alert
:
HighCPUTemperature
expr
:
instance:node_cpu_temperature >
80
for
:
"
5m"
...
...
@@ -138,7 +149,8 @@ groups:
severity
:
warning
annotations
:
summary
:
CPU temperature over 80°C
description
:
CPU temperature averaged over cores is {{ $value }}°C on {{ $labels.instance }}
description
:
CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }}
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
name
:
ram
rules
:
...
...
@@ -151,7 +163,8 @@ groups:
severity
:
warning
annotations
:
summary
:
More than 80% of RAM is used
description
:
Available RAM on {{ $labels.instance }} is {{ $value }}%.
description
:
Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
alert
:
HighRAMUse
expr
:
instance:node_memory <
5
for
:
"
10h"
...
...
@@ -159,7 +172,8 @@ groups:
severity
:
critical
annotations
:
summary
:
More than 95% of RAM is used
description
:
Available RAM on {{ $labels.instance }} is {{ $value }}%.
description
:
Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%.
dashboard
:
https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }}
-
name
:
network
rules
:
...
...
@@ -171,6 +185,7 @@ groups:
annotations
:
summary
:
Network interface is reporting many receive errors
description
:
'
{{
$labels.instance
}}
interface
{{
$labels.device
}}
has
encountered
{{
printf
"%.0f"
$value
}}
receive
errors
in
the
last
two
minutes.'
dashboard
:
https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
-
alert
:
SendHighErrors
expr
:
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) >
0.01
for
:
"
5m"
...
...
@@ -179,6 +194,7 @@ groups:
annotations
:
summary
:
Network interface is reporting many transmit errors
description
:
'
{{
$labels.instance
}}
interface
{{
$labels.device
}}
has
encountered
{{
printf
"%.0f"
$value
}}
transmit
errors
in
the
last
two
minutes.'
dashboard
:
https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
-
alert
:
ReceiveHighDrop
expr
:
rate(node_network_receive_drop_total[2m]) / rate(node_network_receive_packets_total[2m]) >
0.01
for
:
"
5m"
...
...
@@ -187,6 +203,7 @@ groups:
annotations
:
summary
:
Network interface is reporting many receive drops
description
:
'
{{
$labels.instance
}}
interface
{{
$labels.device
}}
has
encountered
{{
printf
"%.0f"
$value
}}
receive
drops
in
the
last
two
minutes.'
dashboard
:
https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
-
alert
:
SendHighDrop
expr
:
rate(node_network_transmit_drop_total[2m]) / rate(node_network_transmit_packets_total[2m]) >
0.01
for
:
"
5m"
...
...
@@ -195,6 +212,7 @@ groups:
annotations
:
summary
:
Network interface is reporting many transmit drops
description
:
'
{{
$labels.instance
}}
interface
{{
$labels.device
}}
has
encountered
{{
printf
"%.0f"
$value
}}
transmit
drops
in
the
last
two
minutes.'
dashboard
:
https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }}
-
name
:
services
rules
:
...
...
@@ -206,6 +224,7 @@ groups:
annotations
:
summary
:
Lot of 4XX errors
description
:
Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
dashboard
:
https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
-
alert
:
500Errors
expr
:
increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) >
50
for
:
"
15m"
...
...
@@ -214,6 +233,7 @@ groups:
annotations
:
summary
:
Lot of 5XX errors
description
:
Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors.
dashboard
:
https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }}
-
alert
:
EndpointDown
expr
:
probe_success ==
0
for
:
"
2m"
...
...
@@ -222,6 +242,8 @@ groups:
annotations
:
summary
:
"
Service
down"
description
:
"
{{
$labels.instance
}}
is
down
for
more
than
2
minutes"
# Redirect to HTTP or DNS dashboard based on vmagent job name
dashboard
:
'
{{
if
eq
$labels.job
"blackbox-http"
-}}https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{
$labels.instance
}}{{-
else
if
eq
$labels.job
"blackbox-dns"
-}}https://grafana.picasoft.net/d/1twteMV7k/serveurs-dns?var-instance={{
$labels.instance
}}{{-
end
-}}'
-
alert
:
PostfixRejection
# As a test, alert on any rejection. If this produces too many alerts, we will raise the threshold.
expr
:
rate(postfix_smtpd_messages_rejected_total[5m]) >
0
...
...
@@ -231,3 +253,4 @@ groups:
annotations
:
summary
:
Mail rejection
description
:
At least one mail sent from {{ $labels.instance }} have been rejected
dashboard
:
https://grafana.picasoft.net/d/VB5CUrn7k/postfix?var-instance={{ $labels.instance }}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment