Verified Commit d713b056 authored by Quentin Duchemin

Add Alertmanager to send basic alerts to Mattermost

parent 016af2f5
ARG VERSION=v0.22.2
FROM prom/alertmanager:${VERSION}
COPY ./entrypoint.sh /entrypoint.sh
# The base image runs as the nobody user, which cannot chmod the entrypoint nor edit the config with sed
USER root
RUN chmod +x /entrypoint.sh
ENTRYPOINT [ "/entrypoint.sh" ]
CMD [ "/bin/alertmanager", "--config.file=/config/alertmanager-with-secrets.yml", "--storage.path=/alertmanager" ]
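For context (not part of the committed files), the image could be built by hand with the tag used in the Compose file further below; the ./alertmanager build context is taken from its build: entry, the rest of the command is an assumption:

# Sketch only: manual build with the tag and context from the Compose file below
docker build --build-arg VERSION=v0.22.2 -t registry.picasoft.net/pica-alertmanager:v0.22.2 ./alertmanager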
global:
  slack_api_url: '$MATTERMOST_WEBHOOK'
# The root route on which each incoming alert enters.
route:
  # The root route must not have any matchers as it is the entry point for
  # all alerts. It needs to have a receiver configured so alerts that do not
  # match any of the sub-routes are sent to someone.
  receiver: 'mattermost'
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts with alertname=LatencyHigh would be batched into a single group.
  group_by: ['instance', 'alertname']
  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first notification.
  group_wait: 30s
  # Once the first notification has been sent, wait 'group_interval' before sending
  # a batch of new alerts that started firing for that group.
  group_interval: 5m
  # If an alert has successfully been sent, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 24h
# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_matchers:
- severity="critical"
target_matchers:
- severity="warning"
# Apply inhibition if the alertname is the same.
# CAUTION:
# If all label names listed in `equal` are missing
# from both the source and target alerts,
# the inhibition rule will apply!
equal: ['alertname']
receivers:
  - name: 'mattermost'
    slack_configs:
      - channel: '$MATTERMOST_CHANNEL'
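As a side note (not in the commit), a quick way to sanity-check the rendered configuration is amtool, which is included in the prom/alertmanager image; the container name matches the Compose file further below:

# Sketch only: validate the config after the entrypoint has substituted the secrets
docker exec alertmanager amtool check-config /config/alertmanager-with-secrets.yml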
#!/bin/sh
if [ -z "${MATTERMOST_WEBHOOK}" ]; then
  echo "MATTERMOST_WEBHOOK is mandatory, please provide it!"
  exit 1
fi
if [ -z "${MATTERMOST_CHANNEL}" ]; then
  echo "MATTERMOST_CHANNEL is mandatory, please provide it!"
  exit 1
fi
# We use a busybox-based image, so envsubst is not available, and the Prometheus team had
# a long debate about whether environment variables should be used for configuration:
# they voted no. See https://github.com/prometheus/prometheus/issues/2357 for example.
# We have no other trivial way if we want to commit the configuration file without secrets inside.
cp /config/alertmanager.yml /config/alertmanager-with-secrets.yml
sed -i "s@\$MATTERMOST_WEBHOOK@${MATTERMOST_WEBHOOK}@g" /config/alertmanager-with-secrets.yml
sed -i "s@\$MATTERMOST_CHANNEL@${MATTERMOST_CHANNEL}@g" /config/alertmanager-with-secrets.yml
# Replace the shell with the CMD defined in the Dockerfile
exec "$@"
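For illustration only, a standalone run of the image outside Compose would look roughly like the sketch below; the bind-mount path assumes the repository root as working directory and <key> stays a placeholder:

# Sketch only: standalone run; paths and the <key> placeholder are assumptions
docker run --rm \
  -v "$(pwd)/alertmanager/alertmanager.yml:/config/alertmanager.yml" \
  -e MATTERMOST_WEBHOOK=https://team.picasoft.net/hooks/<key> \
  -e MATTERMOST_CHANNEL=team-technique \
  registry.picasoft.net/pica-alertmanager:v0.22.2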
@@ -13,6 +13,8 @@ volumes:
    name: victoria-metrics
  vmagent-buffer:
    name: vmagent-buffer
  alertmanager:
    name: alertmanager

services:
  grafana:
@@ -97,12 +99,14 @@ services:
  # Receives alerts and decides what to do, e.g. send a mail or a Mattermost message
  # Takes care of deduplication etc
  alertmanager:
    image: prom/alertmanager:v0.22.2
    image: registry.picasoft.net/pica-alertmanager:v0.22.2
    build: ./alertmanager
    container_name: alertmanager
    command:
      - "--config.file=/config/alertmanager.yml"
    volumes:
      - ./alertmanager.yml:/config/alertmanager.yml
      - ./alertmanager/alertmanager.yml:/config/alertmanager.yml
      # Unnamed volume declared in original Dockerfile
      - alertmanager:/alertmanager
    env_file: ./secrets/alertmanager.secrets
    networks:
      - metrics
    restart: unless-stopped
# See https://team.picasoft.net/picasoft/integrations/incoming_webhooks
MATTERMOST_WEBHOOK=https://team.picasoft.net/hooks/<key>
# Use the channel key as it appears in the channel URL; team-technique is a good default
MATTERMOST_CHANNEL=team-technique
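Once the secrets file is filled in, rebuilding and starting the service would go roughly like this (assuming the docker-compose CLI and that commands run from the directory holding docker-compose.yml):

# Assumption: docker-compose CLI, run next to docker-compose.yml
docker-compose build alertmanager
docker-compose up -d alertmanager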
@@ -28,16 +28,6 @@ groups:
    annotations:
      summary: Disk 95% full on {{ $labels.instance }}
      description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
  - alert: DiskFullSoon
    # Based on the decrease of available filesystem space and a simple linear regression,
    # check whether the available space could reach 0 within 24 hours at the current rate (e.g. mass upload of files)
    expr: predict_linear(node_filesystem_avail_bytes{fstype=~"(ext.|xfs|zfs)"}[6h], 24 * 3600) < 0
    for: "30m"
    labels:
      severity: warning
    annotations:
      summary: Disk expected to be full soon
      description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} could be full within 24 hours
  - alert: VMBackupFull
    # Backup storage is always called "save" at Picasoft.
    # pve_storage_info is always 1, and the multiplication allows getting the `storage` label in the resulting vector
@@ -71,7 +61,7 @@ groups:
    # Only get values from real disks, so ignore VMs
    # This is hardcoded but I cannot see any other way to do it because VMs do not have a specific prefix
    # We must add new machines here
    expr: smartmon_device_smart_healthy{node=~"alice|bob"} != 1
    expr: smartmon_device_smart_healthy{instance=~"alice|bob"} != 1
    labels:
      severity: critical
    annotations:
@@ -231,3 +221,12 @@ groups:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} is down"
  - alert: PostfixRejection
    # As a test, alert on any rejection. If it fires too often, we will raise the threshold
    expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 0
    for: "1m"
    labels:
      severity: warning
    annotations:
      summary: Mail rejection
      description: At least one mail sent from {{ $labels.instance }} has been rejected
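As a final sketch (not part of the commit), rule files in the standard Prometheus format can be syntax-checked with promtool, which ships with Prometheus; the file name below is only an assumption:

# Assumption: the rules live in alerts.yml and promtool is available locally
promtool check rules alerts.yml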