diff --git a/pica-metrologie/README.md b/pica-metrologie/README.md index 6c59e94e0fec2d8c5e10bc11089254595da98978..b18b4e8d1314ba8d30ff70d823871123b75eee77 100644 --- a/pica-metrologie/README.md +++ b/pica-metrologie/README.md @@ -1,12 +1,19 @@ +# Doc dépréciée + +Elle sera mise à jour dans quelques jours après la fusion, pour que le wiki puisse référencer les nouveaux fichiers et inversement que cette doc soit plus synthétique et fasse référence au wiki. + # Stack de métrologie Ce dossier contient les ressources nécessaires pour déployer la partie serveur de la stack de métrologie de Picasoft, à savoir : - Victoria Metrics pour le stockage de métriques +- BlackBox Exporter pour le probing des services webs et des serveurs DNS - `vmagent` pour l'ingestion de métriques +- `vmalert` pour la génération d'alertes +- AlertManager pour la gestion et la transmission des alertes - Grafana pour visualiser les métriques -Pour des raisons de simplicités mais aussi de sécurité, ces 4 services sont déployés sur la même machine, en partageant un même réseau Docker. +Pour des raisons de simplicités mais aussi de sécurité, ces services sont déployés sur la même machine, en partageant un même réseau Docker. Il est fortement recommandé de consulter la [documentation associée](https://wiki.picasoft.net/doku.php?id=technique:adminsys:monitoring:metrologie:stack-picasoft) pour comprendre l'architecture de cette stack de métrologie. @@ -47,8 +54,6 @@ Pour une meilleure fiabilité, le dossier `/vmagent-remotewrite-data` qui stocke Grafana est l'outil de visualisation de métriques [utilisé par Picasoft](https://wiki.picasoft.net/doku.php?id=technique:adminsys:monitoring:metrologie:grafana). -Attention : même si l'authentification LDAP est activée, elle semble ne pas fonctionner : la connexion ne fonctionne que grâce à l'utilisateur administrateur. Voir [cette page](https://grafana.com/docs/grafana/latest/auth/ldap/#ldap-debug-view) pour investiguer et régler le problème. 
- #### Emplacements La configuration est réalisée : @@ -65,10 +70,6 @@ Il y a trois types d'utilisateurs : - Les utilisateurs LDAP - Les utilisateurs créés manuellement, non utilisés par Picasoft -#### TODO - -Le `sed` utilisé pour injecter les secrets dans l'entrypoint est dégueulasse, si jamais il y a certains caractères dans les mots de passe ça marchera pas, il faut faire mieux, en Python par exemple. - ## Mise à jour Pour Victoria Metrics et `vmagent` il suffit de changer les tags utilisés dans le fichier `docker-compose.yml`. On fera attention à utilise la même version pour les deux outils. diff --git a/pica-metrologie/alertmanager/Dockerfile b/pica-metrologie/alertmanager/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..bb58c095c4d9b162c41d83520ebdecbbd9d78211 --- /dev/null +++ b/pica-metrologie/alertmanager/Dockerfile @@ -0,0 +1,13 @@ +ARG VERSION=v0.22.2 + +FROM prom/alertmanager:${VERSION} + +COPY ./entrypoint.sh /entrypoint.sh +COPY ./templates.tpl /config/templates.tpl + +# Initial image uses user nobody which cannot chmod nor sed +USER root +RUN chmod +x /entrypoint.sh + +ENTRYPOINT [ "/entrypoint.sh" ] +CMD [ "--config.file=/etc/amtool/config.yml", "--storage.path=/alertmanager" ] diff --git a/pica-metrologie/alertmanager/alertmanager.yml b/pica-metrologie/alertmanager/alertmanager.yml new file mode 100644 index 0000000000000000000000000000000000000000..429258de22f52a522cb282a6530c8da62b182218 --- /dev/null +++ b/pica-metrologie/alertmanager/alertmanager.yml @@ -0,0 +1,74 @@ +global: + slack_api_url: '$MATTERMOST_WEBHOOK' + +# The root route on which each incoming alert enters. +route: + # The root route must not have any matchers as it is the entry point for + # all alerts. It needs to have a receiver configured so alerts that do not + # match any of the sub-routes are sent to someone. + receiver: 'mattermost' + + # The labels by which incoming alerts are grouped together. 
For example, + # multiple alerts alertname=LatencyHigh would be batched into a single group. + group_by: ['instance', 'alertname'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 24h + +# Inhibition rules allow to mute a set of alerts given that another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. +inhibit_rules: +- source_matchers: + - severity="critical" + target_matchers: + - severity="warning" + # Apply inhibition if the alertname is the same. + # CAUTION: + # If all label names listed in `equal` are missing + # from both the source and target alerts, + # the inhibition rule will apply! 
+ equal: ['alertname'] + +receivers: +- name: 'mattermost' + slack_configs: + - channel: '$MATTERMOST_CHANNEL' + icon_emoji: ":thaenkin:" + username: "AlertManager" + color: '{{ if eq .CommonLabels.severity "warning" -}}warning{{- else if eq .CommonLabels.severity "critical" -}}danger{{- end -}}' + title: "{{ .CommonLabels.alertname }}" + text: "**{{ .CommonLabels.severity }}** : {{ .CommonAnnotations.summary }}" + fields: + - title: "Description" + value: "{{ .CommonAnnotations.description }}" + # Would be better with buttons but Slack new buttons are not + # yet compatible with Mattermost interactive messages model + - title: ":chart_with_upwards_trend: See the data" + value: "[Open Grafana]({{ (index .Alerts 0).Annotations.dashboard }})" + short: true + - title: ':no_bell: Silence' + value: '[Silence the alert]({{ template "__alert_silence_link" . }})' + short: true + - title: ':question: Documentation' + value: "[See the wiki](https://wiki.picasoft.net/doku.php?id=technique:adminsys:monitoring:metrologie:stack-picasoft)" + short: true + - title: ':fire: See all alerts' + value: "[AlertManager WebUI](https://alertmanager.picasoft.net)" + short: true + +templates: +- /config/templates.tpl diff --git a/pica-metrologie/alertmanager/entrypoint.sh b/pica-metrologie/alertmanager/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..6fe5e4bf23b243c473305ab8abec1dd2a9320eb3 --- /dev/null +++ b/pica-metrologie/alertmanager/entrypoint.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +if [ -z "${MATTERMOST_WEBHOOK}" ]; then + echo "MATTERMOST_WEBHOOK is mandatory, please provide it!" +fi + +if [ -z "${MATTERMOST_CHANNEL}" ]; then + echo "MATTERMOST_CHANNEL is mandatory, please provide it!" +fi + +# We use a busybox image, no way to use envsubst and Prometheus team had +# a long debate about whether env variable should be used for configuration, +# and they voted no. See https://github.com/prometheus/prometheus/issues/2357 for example. 
+# We have no other trivial way if we want to commit the configuration file without secrets inside. +mkdir -p /etc/amtool +cp /config/alertmanager.yml /etc/amtool/config.yml +sed -i "s@\$MATTERMOST_WEBHOOK@${MATTERMOST_WEBHOOK}@g" /etc/amtool/config.yml +sed -i "s@\$MATTERMOST_CHANNEL@${MATTERMOST_CHANNEL}@g" /etc/amtool/config.yml + +# Substitute shell with alertmanager + arguments passed in Docker CMD +exec /bin/alertmanager $@ diff --git a/pica-metrologie/alertmanager/templates.tpl b/pica-metrologie/alertmanager/templates.tpl new file mode 100644 index 0000000000000000000000000000000000000000..38636a9c54af906644d8105131920d5ef2bceac1 --- /dev/null +++ b/pica-metrologie/alertmanager/templates.tpl @@ -0,0 +1,9 @@ +{{ define "__alert_silence_link" -}} + {{ .ExternalURL }}/#/silences/new?filter=%7B + {{- range .CommonLabels.SortedPairs -}} + {{- if ne .Name "alertname" -}} + {{- .Name }}%3D'{{- .Value -}}'%2C%20 + {{- end -}} + {{- end -}} + alertname%3D'{{ .CommonLabels.alertname }}'%7D +{{- end }} diff --git a/pica-metrologie/blackbox.yml b/pica-metrologie/blackbox.yml new file mode 100644 index 0000000000000000000000000000000000000000..e85509fe704b5cfa2919564d8d536e793e3e5c16 --- /dev/null +++ b/pica-metrologie/blackbox.yml @@ -0,0 +1,24 @@ +modules: + http_2xx: + # Probe web services and give up after 10s of no response + prober: http + timeout: 10s + http: + method: GET + # Because Traefik could redirect to + # HTTPS, we need to follow to see if service is up + follow_redirects: true + headers: + Origin: blackbox.picasoft.net + # Docker often blocks v6 without further configuration, + # prevent false failures using v4 by default + preferred_ip_protocol: ip4 + # All our services must be HTTPS + fail_if_not_ssl: true + + dns_soa: + # To detect DNS server failures + prober: dns + dns: + query_name: picasoft.net + query_type: SOA diff --git a/pica-metrologie/docker-compose.yml b/pica-metrologie/docker-compose.yml index 
7872aa48c0333f759b31211cfcd68bc649b035b4..86d374f40218dd8a9c44aba85b91c391808cb37b 100644 --- a/pica-metrologie/docker-compose.yml +++ b/pica-metrologie/docker-compose.yml @@ -13,6 +13,8 @@ volumes: name: victoria-metrics vmagent-buffer: name: vmagent-buffer + alertmanager: + name: alertmanager services: grafana: @@ -52,6 +54,7 @@ services: - metrics restart: unless-stopped + # Stores all metrics in a TSDB compatible with PromQL queries vmagent: image: victoriametrics/vmagent:v1.63.0 container_name: vmagent @@ -67,3 +70,79 @@ services: networks: - metrics restart: unless-stopped + + # Fires alerts based on custom rules (like disk > 80% etc) + vmalert: + image: victoriametrics/vmalert:v1.62.0 + container_name: vmalert + command: + - "-rule=/config/vmalert-rules.yml" + # Where to read metrics + - "-datasource.url=http://victoria-metrics:8428" + # Where to write and read alert states, to keep + # state during restart, as vmalert stores states in memory + - "-remoteWrite.url=http://victoria-metrics:8428" + - "-remoteRead.url=http://victoria-metrics:8428" + # Where to send alert when they must be triggered + - "-notifier.url=http://alertmanager:9093" + # HTTP server for vmalert's own metrics + - "-httpListenAddr=:8880" + # By default, evaluate rules every 1 minute + - "-evaluationInterval=1m" + - "-loggerOutput=stdout" + volumes: + - ./vmalert-rules.yml:/config/vmalert-rules.yml + networks: + - metrics + restart: unless-stopped + + # Receives alerts and decides what to do, e.g. 
send a mail or a Mattermost message + # Takes care of deduplication etc + alertmanager: + image: registry.picasoft.net/pica-alertmanager:v0.22.2 + build: ./alertmanager + container_name: alertmanager + volumes: + - ./alertmanager/alertmanager.yml:/config/alertmanager.yml + # Unnamed volume declared in original Dockerfile + - alertmanager:/alertmanager + env_file: ./secrets/alertmanager.secrets + labels: + # For alertmanager web interface + traefik.http.routers.alertmanager.entrypoints: websecure + traefik.http.routers.alertmanager.rule: "Host(`alertmanager.picasoft.net`)" + traefik.http.routers.alertmanager.service: alertmanager + traefik.http.routers.alertmanager.middlewares: "alertmanager-auth@docker" + traefik.http.middlewares.alertmanager-auth.basicauth.users: "${ALERTMANAGER_AUTH}" + traefik.http.services.alertmanager.loadbalancer.server.port: 9093 + traefik.enable: true + command: + - "--config.file=/etc/amtool/config.yml" + - "--storage.path=/alertmanager" + - "--web.external-url=https://alertmanager.picasoft.net" + networks: + - metrics + - proxy + restart: unless-stopped + + # Monitors HTTP or DNS endpoints and store results in VictoriaMetrics + # Very useful to know when a service is down + blackbox: + image: prom/blackbox-exporter:v0.19.0 + container_name: blackbox + command: + - "--config.file=/config/blackbox.yml" + volumes: + - ./blackbox.yml:/config/blackbox.yml + networks: + - metrics + - proxy + labels: + traefik.http.routers.blackbox-exporter.entrypoints: websecure + traefik.http.routers.blackbox-exporter.rule: "Host(`blackbox.picasoft.net`)" + traefik.http.routers.blackbox-exporter.service: blackbox-exporter + traefik.http.routers.blackbox-exporter.middlewares: "blackbox-exporter-auth@docker" + traefik.http.middlewares.blackbox-exporter-auth.basicauth.users: "${METRICS_AUTH}" + traefik.http.services.blackbox-exporter.loadbalancer.server.port: 9115 + traefik.enable: true + restart: unless-stopped diff --git 
a/pica-metrologie/secrets/alertmanager.secrets.example b/pica-metrologie/secrets/alertmanager.secrets.example new file mode 100644 index 0000000000000000000000000000000000000000..946983fc6309296f9bfc3c0e7e615deca109f80c --- /dev/null +++ b/pica-metrologie/secrets/alertmanager.secrets.example @@ -0,0 +1,4 @@ +# See https://team.picasoft.net/picasoft/integrations/incoming_webhooks +MATTERMOST_WEBHOOK=https://team.picasoft.net/hooks/<key> +# Use the channel key in its URL, team-technique is a good default +MATTERMOST_CHANNEL=team-technique diff --git a/pica-metrologie/secrets/exporters-auth.secrets.example b/pica-metrologie/secrets/exporters-auth.secrets.example index 78949a499be642492e7e2615689ab80bd7d635df..ec0435c490532069682b115aebb88d7737dd8fd6 100644 --- a/pica-metrologie/secrets/exporters-auth.secrets.example +++ b/pica-metrologie/secrets/exporters-auth.secrets.example @@ -22,3 +22,5 @@ PEERTUBE_METRICS_USER=peertube PEERTUBE_METRICS_PASSWORD=superpassword POSTFIX_METRICS_USER=peertube POSTFIX_METRICS_PASSWORD=superpassword +BLACKBOX_METRICS_USER=blackbox +BLACKBOX_METRICS_PASSWORD=superpassword diff --git a/pica-metrologie/vmagent-prom.yml b/pica-metrologie/vmagent-prom.yml index 705af83238da2d584f7e9ac4b3171c5611d7c2ad..5d44fe7ff17c33b947a3f76e2180616295dacc40 100644 --- a/pica-metrologie/vmagent-prom.yml +++ b/pica-metrologie/vmagent-prom.yml @@ -24,7 +24,6 @@ scrape_configs: - "voice.picasoft.net" # Scrape CodiMD metrics - job_name: codimd - honor_timestamps: true metrics_path: "/metrics/codimd" scheme: "https" basic_auth: @@ -33,13 +32,7 @@ scrape_configs: static_configs: - targets: - "md.picasoft.net" - relabel_configs: - - source_labels: [__address__] - regex: ".*" - target_label: instance - replacement: "md.picasoft.net" - job_name: codimd-router - honor_timestamps: true metrics_path: /metrics/router scheme: https basic_auth: @@ -48,11 +41,6 @@ scrape_configs: static_configs: - targets: - "md.picasoft.net" - relabel_configs: - - source_labels: 
[__address__] - regex: ".*" - target_label: instance - replacement: "md.picasoft.net" # Scrape PrivateBin metrics - job_name: privatebin metrics_path: /metrics.php @@ -63,11 +51,6 @@ scrape_configs: static_configs: - targets: - "paste.picasoft.net" - relabel_configs: - - source_labels: [__address__] - regex: ".*" - target_label: instance - replacement: "paste.picasoft.net" # Scrape Mattermost metrics - job_name: mattermost scheme: "https" @@ -131,6 +114,87 @@ scrape_configs: static_configs: - targets: - "mail.picasoft.net" + # Scrape metrics about Picasoft services + # via Blackbox Exporter + - job_name: blackbox-http + scheme: "https" + basic_auth: + username: "%{BLACKBOX_METRICS_USER}" + password: "%{BLACKBOX_METRICS_PASSWORD}" + # Blackbox serves metrics under /probe + metrics_path: /probe + # See blackbox.yml : `module` is passed as GET parameter + # Normally the target (i.e. team.picasoft.net) is also passed as a GET parameter + # so that the request looks like : https://probe.picasoft.net/probe?target=team.picasoft.net&module=http_2xx + # Problem is we would have to create as many jobs as targets, which is hard to read. + # So we use static_configs targets and relabelling to do so. 
Credits to https://prometheus.io/docs/guides/multi-target-exporter/ + params: + module: [http_2xx] + static_configs: + - targets: + - team.picasoft.net + - pad.picasoft.net + - wiki.picasoft.net + - kanban.picasoft.net + - cloudcet.picasoft.net + - uploads.picasoft.net + - www.picasoft.net + - week.pad.picasoft.net + - doc.picasoft.net + - school.picasoft.net + - radio.picasoft.net + - culture.picasoft.net + - blog.picasoft.net + - voice.picasoft.net + - mobilizon.picasoft.net + - board.picasoft.net + - md.picasoft.net + - impactometre.fr + - paste.picasoft.net + - mastogem.picasoft.net + - tube.picasoft.net + - drop.picasoft.net + - podcast.picasoft.net + - grafana.picasoft.net + - cloud.picasoft.net + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox.picasoft.net # The blackbox exporter’s real hostname:port. + # Scrape metrics about Picasoft DNS servers + - job_name: blackbox-dns + scheme: "https" + basic_auth: + username: "%{BLACKBOX_METRICS_USER}" + password: "%{BLACKBOX_METRICS_PASSWORD}" + metrics_path: /probe + params: + module: [dns_soa] + static_configs: + - targets: + - 91.224.148.84 #ns01.picasoft.net + - 91.224.148.85 #ns02.picasoft.net + - 51.158.76.113 #ns03.picasoft.net + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox.picasoft.net + # Scrape metrics about Blackbox itself + - job_name: blackbox + scheme: "https" + basic_auth: + username: "%{BLACKBOX_METRICS_USER}" + password: "%{BLACKBOX_METRICS_PASSWORD}" + metrics_path: /metrics + static_configs: + - targets: + - blackbox.picasoft.net # Scrape Picasoft servers node-exporter - job_name: "pica01" static_configs: diff --git a/pica-metrologie/vmalert-rules.yml b/pica-metrologie/vmalert-rules.yml new file mode 
100644 index 0000000000000000000000000000000000000000..5bf8331a71a403df12abdb23fcd8998fa25f7b8b --- /dev/null +++ b/pica-metrologie/vmalert-rules.yml @@ -0,0 +1,211 @@ +groups: + # The name of the group. Must be unique within a file. +- name: disk + rules: + # The name of the alert. Must be a valid metric name. + - alert: DiskFull + expr: (100 - ((node_filesystem_avail_bytes{fstype=~"ext.|xfs"} / node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100)) > 90 + for: "10m" + labels: + severity: warning + annotations: + summary: Disk 90% full on {{ $labels.instance }} + description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ printf "%.0f" $value }}% full + dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }} + - alert: VMBackupFull + # Backup storage is always called "save" at Picasoft. + # pve_storage_info is always 1, and the multiplication allow to get `storage` label in the resulting vector + # missing from pve_disk_* but present in pve_storage_info with join on `id` + expr: ((pve_disk_usage_bytes{id=~"storage/.+/save"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90 + for: "6h" + labels: + severity: warning + annotations: + summary: Proxmox backup volume 90% full + description: Proxmox backup volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full + dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }} + - alert: VMStorageSSDFull + # SSD storage is always called "local" at Picasoft. 
+ expr: ((pve_disk_usage_bytes{id=~"storage/.+/local"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90 + for: "10m" + labels: + severity: critical + annotations: + summary: Proxmox SSD volume 90% full + description: Proxmox SSD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full + dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }} + - alert: VMStorageHDDFull + # HDD storage always has "hdd" in its name at Picasoft. + expr: ((pve_disk_usage_bytes{id=~"storage/.+/.+hdd"} / pve_disk_size_bytes * on (id) group_left(storage) pve_storage_info) * 100) > 90 + for: "10m" + labels: + severity: critical + annotations: + summary: Proxmox HDD volume 90% full + description: Proxmox HDD volume ({{ $labels.storage }}) on {{ $labels.instance }} is {{ printf "%.0f" $value }}% full + dashboard: https://grafana.picasoft.net/d/proxmox/proxmox?var-instance={{ $labels.instance }} + - alert: DiskDamaged + # Only get values from real disks so ignore VMs + # This is hardcoded but I cannot see other way to do so because VMs do not have a specific prefix + # We must add new machines here + expr: smartmon_device_smart_healthy{instance=~"alice|bob"} != 1 + for: "1m" + labels: + severity: critical + annotations: + summary: Physical disk unhealthy + description: Disk {{ $labels.disk }} on machine {{ $labels.instance }} is marked unhealthy in S.M.A.R.T values + dashboard: https://grafana.picasoft.net/d/PkPI4xGWz/s-m-a-r-t-info?var-node={{ $labels.instance }} + - alert: RaidDegraded + expr: (node_md_disks - node_md_disks_active) != 0 + for: "1m" + labels: + severity: warning + annotations: + summary: RAID on node {{ $labels.instance }} is in degraded mode + description: "Degraded RAID array {{ $labels.device }} on {{ $labels.instance }} : {{ $value }} disks failed" + dashboard: https://grafana.picasoft.net/d/iwR8rQBZk/raid-state?var-node={{ $labels.instance }} + - alert: 
DiskHighTemperature + expr: (avg(smartmon_temperature_celsius_raw_value) by (instance, disk)) > 60 + for: "5m" + labels: + severity: critical + annotations: + summary: Disk temperature > 60°C + description: Disk {{ $labels.disk }} on {{ $labels.instance }} at {{ printf "%.0f" $value }}°C for more than 5 minutes + dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }} + +- name: cpu + rules: + # Records are useful to pre-compute metrics + # and re-use them in alerting rules + # The count of CPUs per node, useful for getting CPU time as a percent of total. + - record: instance:node_cpus:count + expr: > + count without (cpu, mode) ( + node_cpu_seconds_total{mode="idle"} + ) + # CPU in use by mode. + - record: instance_mode:node_cpu_seconds:rate1m + expr: > + sum without (cpu) ( + rate(node_cpu_seconds_total[1m]) + ) + # CPU in use ratio. + - record: instance:node_cpu_utilization:ratio + expr: > + sum without (mode) ( + instance_mode:node_cpu_seconds:rate1m{mode!="idle"} + ) / instance:node_cpus:count + - record: instance:node_cpu_temperature + expr: avg(node_hwmon_temp_celsius) by (instance) + - alert: HighCPU + expr: instance:node_cpu_utilization:ratio * 100 > 90 + for: "10m" + labels: + severity: warning + annotations: + summary: CPU usage over 90% + description: CPU use percent is {{ printf "%.0f" $value }}% on {{ $labels.instance }} for the past 30 minutes + dashboard: https://grafana.picasoft.net/d/VIb73SGWa/server-overview?var-node={{ $labels.instance }} + - alert: HighCPUTemperature + expr: instance:node_cpu_temperature > 80 + for: "5m" + labels: + severity: warning + annotations: + summary: CPU temperature over 80°C + description: CPU temperature averaged over cores is {{ printf "%.0f" $value }}°C on {{ $labels.instance }} + dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }} + +- name: ram + rules: + - alert: HighRAMUse + expr: ((node_memory_MemAvailable_bytes / 
node_memory_MemTotal_bytes) * 100) < 10 + for: "10m" + labels: + severity: warning + annotations: + summary: More than 90% of RAM is used + description: Available RAM on {{ $labels.instance }} is {{ printf "%.0f" $value }}%. + dashboard: https://grafana.picasoft.net/d/moX2wwfZk/temperatures?var-node={{ $labels.instance }} + +- name: network + rules: + - alert: ReceiveHighErrors + expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: "5m" + labels: + severity: warning + annotations: + summary: Network interface is reporting many receive errors + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' + dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }} + - alert: SendHighErrors + expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: "5m" + labels: + severity: warning + annotations: + summary: Network interface is reporting many transmit errors + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }} + - alert: ReceiveHighDrop + expr: rate(node_network_receive_drop_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: "5m" + labels: + severity: warning + annotations: + summary: Network interface is reporting many receive drops + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive drops in the last two minutes.' 
+ dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }} + - alert: SendHighDrop + expr: rate(node_network_transmit_drop_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: "5m" + labels: + severity: warning + annotations: + summary: Network interface is reporting many transmit drops + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit drops in the last two minutes.' + dashboard: https://grafana.picasoft.net/d/QPF5l5uZa/network?var-node={{ $labels.instance }} + +- name: services + rules: + - alert: 404Errors + expr: increase(traefik_service_requests_total{code=~"4[0-9][0-8]"}[15m]) > 50 + for: "2m" + labels: + severity: warning + annotations: + summary: Lot of 4XX errors + description: Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors. + dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }} + - alert: 500Errors + expr: increase(traefik_service_requests_total{code=~"5[0-9]{2}"}[15m]) > 50 + for: "2m" + labels: + severity: warning + annotations: + summary: Lot of 5XX errors + description: Service {{ $labels.service_name }} running on {{ $labels.instance }} encoutering lot of {{ $labels.code }} errors. 
+ dashboard: https://grafana.picasoft.net/d/3ipsWfViz/traefik?var-node={{ $labels.instance }}&var-service={{ $labels.service_name }} + - alert: EndpointDown + expr: probe_success == 0 + for: "2m" + labels: + severity: critical + annotations: + summary: "Service down" + description: "{{ $labels.instance }} is down for more than 2 minutes" + # Redirect to HTTP or DNS dashboard based on vmagent job name + dashboard: '{{ if eq $labels.job "blackbox-http" -}}https://grafana.picasoft.net/d/8BOa8W47z/services-web?var-instance={{ $labels.instance }}{{- else if eq $labels.job "blackbox-dns" -}}https://grafana.picasoft.net/d/1twteMV7k/serveurs-dns?var-instance={{ $labels.instance }}{{- end -}}' + - alert: PostfixRejection + # As a test, alert when there is any rejection. If too much alert we will raise the threshold + expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 0 + for: "1m" + labels: + severity: warning + annotations: + summary: Mail rejection + description: At least one mail sent from {{ $labels.instance }} have been rejected + dashboard: https://grafana.picasoft.net/d/VB5CUrn7k/postfix?var-instance={{ $labels.instance }}