Verified Commit d713b056 authored by Quentin Duchemin

Add Alertmanager to send basic alerts to Mattermost

parent 016af2f5
ARG VERSION=v0.22.2
FROM prom/alertmanager:${VERSION}
COPY ./entrypoint.sh /entrypoint.sh
# The base image runs as the nobody user, which cannot chmod the entrypoint nor edit the config with sed
USER root
RUN chmod +x /entrypoint.sh
ENTRYPOINT [ "/entrypoint.sh" ]
CMD [ "/bin/alertmanager", "--config.file=/config/alertmanager-with-secrets.yml", "--storage.path=/alertmanager" ]
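For context (not part of the committed files), the image could be built by hand with the tag used in the Compose file further below; the ./alertmanager build context is taken from its build: entry, the rest of the command is an assumption:

# Sketch only: manual build with the tag and context from the Compose file below
docker build --build-arg VERSION=v0.22.2 -t registry.picasoft.net/pica-alertmanager:v0.22.2 ./alertmanager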
global:
  slack_api_url: '$MATTERMOST_WEBHOOK'
# The root route on which each incoming alert enters.
route:
  # The root route must not have any matchers as it is the entry point for
  # all alerts. It needs to have a receiver configured so alerts that do not
  # match any of the sub-routes are sent to someone.
  receiver: 'mattermost'
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts with alertname=LatencyHigh would be batched into a single group.
  group_by: ['instance', 'alertname']
  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first notification.
  group_wait: 30s
  # Once the first notification has been sent, wait 'group_interval' before sending
  # a batch of new alerts that started firing for that group.
  group_interval: 5m
  # If an alert has successfully been sent, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 24h
# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_matchers:
- severity="critical"
target_matchers:
- severity="warning"
# Apply inhibition if the alertname is the same.
# CAUTION:
# If all label names listed in `equal` are missing
# from both the source and target alerts,
# the inhibition rule will apply!
equal: ['alertname']
receivers:
  - name: 'mattermost'
    slack_configs:
      - channel: '$MATTERMOST_CHANNEL'
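As a side note (not in the commit), a quick way to sanity-check the rendered configuration is amtool, which is included in the prom/alertmanager image; the container name matches the Compose file further below:

# Sketch only: validate the config after the entrypoint has substituted the secrets
docker exec alertmanager amtool check-config /config/alertmanager-with-secrets.yml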
#!/bin/sh
if [ -z "${MATTERMOST_WEBHOOK}" ]; then
  echo "MATTERMOST_WEBHOOK is mandatory, please provide it!"
  exit 1
fi
if [ -z "${MATTERMOST_CHANNEL}" ]; then
  echo "MATTERMOST_CHANNEL is mandatory, please provide it!"
  exit 1
fi
# We use a busybox-based image, so envsubst is not available, and the Prometheus team had
# a long debate about whether environment variables should be used for configuration:
# they voted no. See https://github.com/prometheus/prometheus/issues/2357 for example.
# We have no other trivial way if we want to commit the configuration file without secrets inside.
cp /config/alertmanager.yml /config/alertmanager-with-secrets.yml
sed -i "s@\$MATTERMOST_WEBHOOK@${MATTERMOST_WEBHOOK}@g" /config/alertmanager-with-secrets.yml
sed -i "s@\$MATTERMOST_CHANNEL@${MATTERMOST_CHANNEL}@g" /config/alertmanager-with-secrets.yml
# Replace the shell with the CMD defined in the Dockerfile
exec "$@"
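For illustration only, a standalone run of the image outside Compose would look roughly like the sketch below; the bind-mount path assumes the repository root as working directory and <key> stays a placeholder:

# Sketch only: standalone run; paths and the <key> placeholder are assumptions
docker run --rm \
  -v "$(pwd)/alertmanager/alertmanager.yml:/config/alertmanager.yml" \
  -e MATTERMOST_WEBHOOK=https://team.picasoft.net/hooks/<key> \
  -e MATTERMOST_CHANNEL=team-technique \
  registry.picasoft.net/pica-alertmanager:v0.22.2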
@@ -13,6 +13,8 @@ volumes:
    name: victoria-metrics
  vmagent-buffer:
    name: vmagent-buffer
  alertmanager:
    name: alertmanager

services:
  grafana:
@@ -97,12 +99,14 @@ services:
  # Receives alerts and decides what to do, e.g. send a mail or a Mattermost message
  # Takes care of deduplication etc
  alertmanager:
    image: prom/alertmanager:v0.22.2
    image: registry.picasoft.net/pica-alertmanager:v0.22.2
    build: ./alertmanager
    container_name: alertmanager
    command:
      - "--config.file=/config/alertmanager.yml"
    volumes:
      - ./alertmanager.yml:/config/alertmanager.yml
      - ./alertmanager/alertmanager.yml:/config/alertmanager.yml
      # Unnamed volume declared in original Dockerfile
      - alertmanager:/alertmanager
    env_file: ./secrets/alertmanager.secrets
    networks:
      - metrics
    restart: unless-stopped
# See https://team.picasoft.net/picasoft/integrations/incoming_webhooks
MATTERMOST_WEBHOOK=https://team.picasoft.net/hooks/<key>
# Use the channel key as it appears in the channel URL; team-technique is a good default
MATTERMOST_CHANNEL=team-technique
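Once the secrets file is filled in, rebuilding and starting the service would go roughly like this (assuming the docker-compose CLI and that commands run from the directory holding docker-compose.yml):

# Assumption: docker-compose CLI, run next to docker-compose.yml
docker-compose build alertmanager
docker-compose up -d alertmanager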
@@ -28,16 +28,6 @@ groups:
    annotations:
      summary: Disk 95% full on {{ $labels.instance }}
      description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
  - alert: DiskFullSoon
    # Based on the decrease of available filesystem space and a simple linear regression,
    # check whether the available space could reach 0 within 24 hours at the current rate (e.g. mass upload of files)
    expr: predict_linear(node_filesystem_avail_bytes{fstype=~"(ext.|xfs|zfs)"}[6h], 24 * 3600) < 0
    for: "30m"
    labels:
      severity: warning
    annotations:
      summary: Disk expected to be full soon
      description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} could be full within 24 hours
  - alert: VMBackupFull
    # Backup storage is always called "save" at Picasoft.
    # pve_storage_info is always 1, and the multiplication allows getting the `storage` label in the resulting vector
@@ -71,7 +61,7 @@ groups:
    # Only get values from real disks, so ignore VMs
    # This is hardcoded but I cannot see any other way to do it because VMs do not have a specific prefix
    # We must add new machines here
    expr: smartmon_device_smart_healthy{node=~"alice|bob"} != 1
    expr: smartmon_device_smart_healthy{instance=~"alice|bob"} != 1
    labels:
      severity: critical
    annotations:
@@ -231,3 +221,12 @@ groups:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} is down"
  - alert: PostfixRejection
    # As a test, alert on any rejection. If it fires too often, we will raise the threshold
    expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 0
    for: "1m"
    labels:
      severity: warning
    annotations:
      summary: Mail rejection
      description: At least one mail sent from {{ $labels.instance }} has been rejected
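As a final sketch (not part of the commit), rule files in the standard Prometheus format can be syntax-checked with promtool, which ships with Prometheus; the file name below is only an assumption:

# Assumption: the rules live in alerts.yml and promtool is available locally
promtool check rules alerts.yml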