diff --git a/pica-metrologie/vmalert-rules.yml b/pica-metrologie/vmalert-rules.yml
index 69282119dcb78110faa8090e61ac8a0eef534f46..a8a34a7b42a180bc7abdce64a079215b538c0a1a 100644
--- a/pica-metrologie/vmalert-rules.yml
+++ b/pica-metrologie/vmalert-rules.yml
@@ -199,6 +199,16 @@ groups:
           description: "{{ $labels.instance }} is down for more than 2 minutes"
           # Redirect to HTTP or DNS dashboard based on vmagent job name
           dashboard: '{{ if eq $labels.job "blackbox-http" -}}https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{ $labels.instance }}{{- else if eq $labels.job "blackbox-dns" -}}https://grafana.picasoft.net/d/1twteMV7k/serveurs-dns?var-instance={{ $labels.instance }}{{- else if eq $labels.job "blackbox-mail" -}}https://grafana.picasoft.net/d/VB5CUrn7k/postfix{{- end -}}'
+      - alert: BlackboxDNSResolutionFailed
+        # Fire when the "resolve" phase of an HTTP probe stays above 5 (seconds, going by the metric name) for 5 minutes
+        expr: probe_http_duration_seconds{phase="resolve"} > 5
+        for: "5m"
+        labels:
+          severity: warning
+        annotations:
+          summary: "DNS resolver seems down"
+          description: "Taking too much time to resolve {{ $labels.instance }}. Check the health of DNS resolvers on the monitoring machine."
+          dashboard: 'https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{ $labels.instance }}'
       - alert: PostfixRejection
         # As a test, alert when there is any rejection. If too much alert we will raise the threshold
         expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 1