Verified commit d713b056 authored by Quentin Duchemin

Add Alertmanager to send basic alerts to Mattermost

parent 016af2f5
Merge request !65: Add alerting
ARG VERSION=v0.22.2
FROM prom/alertmanager:${VERSION}
COPY ./entrypoint.sh /entrypoint.sh
# The base image runs as user nobody, which cannot run chmod or sed on the configuration
USER root
RUN chmod +x /entrypoint.sh
ENTRYPOINT [ "/entrypoint.sh" ]
CMD [ "/bin/alertmanager", "--config.file=/config/alertmanager-with-secrets.yml", "--storage.path=/alertmanager" ]
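For a quick standalone check, the image can also be built and run outside of Compose; this is only a sketch, where the tag, paths and webhook value are placeholders and the commands are assumed to be run from the stack's root directory:

docker build -t pica-alertmanager:local ./alertmanager
docker run --rm \
  -v "$(pwd)/alertmanager/alertmanager.yml:/config/alertmanager.yml" \
  -e MATTERMOST_WEBHOOK='https://team.picasoft.net/hooks/xxxx' \
  -e MATTERMOST_CHANNEL='team-technique' \
  pica-alertmanager:local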
global:
  # Mattermost incoming webhooks are compatible with the Slack webhook format,
  # hence the use of the Slack integration.
  slack_api_url: '$MATTERMOST_WEBHOOK'

# The root route on which each incoming alert enters.
route:
  # The root route must not have any matchers, as it is the entry point for
  # all alerts. It needs to have a receiver configured so that alerts which do
  # not match any of the sub-routes are sent to someone.
  receiver: 'mattermost'

  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts with alertname=LatencyHigh would be batched into a single group.
  group_by: ['instance', 'alertname']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that alerts of the same group which start firing shortly
  # after one another are batched together in the first notification.
  group_wait: 30s

  # Once the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 24h

# Inhibition rules allow muting a set of alerts while another alert is firing.
# We use this to mute any warning-level notification if the same alert is
# already critical.
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: ['alertname']

receivers:
  - name: 'mattermost'
    slack_configs:
      - channel: '$MATTERMOST_CHANNEL'
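Once the container is running (see the Compose service below), the rendered configuration can be validated with amtool, which ships in the prom/alertmanager image; the container name below matches the container_name set in the Compose file:

docker exec alertmanager amtool check-config /config/alertmanager-with-secrets.yml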
#!/bin/sh

if [ -z "${MATTERMOST_WEBHOOK}" ]; then
  echo "MATTERMOST_WEBHOOK is mandatory, please provide it!"
  exit 1
fi

if [ -z "${MATTERMOST_CHANNEL}" ]; then
  echo "MATTERMOST_CHANNEL is mandatory, please provide it!"
  exit 1
fi

# The image is busybox-based, so there is no envsubst, and the Prometheus team
# debated at length whether environment variables should be used for configuration
# and voted no. See https://github.com/prometheus/prometheus/issues/2357 for example.
# We have no other trivial way if we want to commit the configuration file without secrets inside.
# '@' is used as the sed delimiter because the webhook URL contains '/'.
cp /config/alertmanager.yml /config/alertmanager-with-secrets.yml
sed -i "s@\$MATTERMOST_WEBHOOK@${MATTERMOST_WEBHOOK}@g" /config/alertmanager-with-secrets.yml
sed -i "s@\$MATTERMOST_CHANNEL@${MATTERMOST_CHANNEL}@g" /config/alertmanager-with-secrets.yml

# Replace the shell with the CMD defined in the Dockerfile
exec "$@"
@@ -13,6 +13,8 @@ volumes:
    name: victoria-metrics
  vmagent-buffer:
    name: vmagent-buffer
  alertmanager:
    name: alertmanager

services:
  grafana:
@@ -97,12 +99,14 @@ services:
  # Receives alerts and decides what to do with them, e.g. send an email or a Mattermost message
  # Takes care of deduplication, grouping, etc.
  alertmanager:
    image: prom/alertmanager:v0.22.2
    image: registry.picasoft.net/pica-alertmanager:v0.22.2
    build: ./alertmanager
    container_name: alertmanager
    command:
      - "--config.file=/config/alertmanager.yml"
    volumes:
      - ./alertmanager.yml:/config/alertmanager.yml
      - ./alertmanager/alertmanager.yml:/config/alertmanager.yml
      # Unnamed volume declared in the upstream image's Dockerfile
      - alertmanager:/alertmanager
    env_file: ./secrets/alertmanager.secrets
    networks:
      - metrics
    restart: unless-stopped
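To deploy only this service and check end-to-end delivery to Mattermost, a synthetic alert can be pushed to the Alertmanager v2 API. This is a sketch: the service publishes no host port, so the request is sent from a throwaway container attached to the same Docker network; the actual network name may be prefixed by the Compose project name, and the curl image used here is an assumption:

docker-compose up -d --build alertmanager
docker run --rm --network metrics curlimages/curl \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert","severity":"warning","instance":"manual-test"}}]' \
  http://alertmanager:9093/api/v2/alerts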
# See https://team.picasoft.net/picasoft/integrations/incoming_webhooks
MATTERMOST_WEBHOOK=https://team.picasoft.net/hooks/<key>
# Use the channel name as it appears in its URL; team-technique is a good default
MATTERMOST_CHANNEL=team-technique
@@ -28,16 +28,6 @@ groups:
        annotations:
          summary: Disk 95% full on {{ $labels.instance }}
          description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} is {{ $value }}% full
      - alert: DiskFullSoon
        # Based on the decrease in available filesystem space and a simple linear regression,
        # check whether the available space could reach 0 within 24 hours at the current rate (e.g. mass upload of files)
        expr: predict_linear(node_filesystem_avail_bytes{fstype=~"(ext.|xfs|zfs)"}[6h], 24 * 3600) < 0
        for: "30m"
        labels:
          severity: warning
        annotations:
          summary: Disk expected to be full soon
          description: Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} could be full within 24 hours
      - alert: VMBackupFull
        # Backup storage is always called "save" at Picasoft.
        # pve_storage_info is always 1, and the multiplication allows getting the `storage` label in the resulting vector
@@ -71,7 +61,7 @@ groups:
        # Only get values from real disks, so ignore VMs.
        # This is hardcoded, but I cannot see another way to do it because VMs do not have a specific prefix.
        # We must add new machines here.
        expr: smartmon_device_smart_healthy{node=~"alice|bob"} != 1
        expr: smartmon_device_smart_healthy{instance=~"alice|bob"} != 1
        labels:
          severity: critical
        annotations:
@@ -231,3 +221,12 @@ groups:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} is down"
      - alert: PostfixRejection
        # As a test, alert on any rejection. If it fires too often, we will raise the threshold.
        expr: rate(postfix_smtpd_messages_rejected_total[5m]) > 0
        for: "1m"
        labels:
          severity: warning
        annotations:
          summary: Mail rejection
          description: At least one mail sent from {{ $labels.instance }} has been rejected
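These alerting rules use the standard Prometheus rule-file format, so they can be syntax-checked with promtool before deployment; this is a sketch where the rule-file path and the Prometheus image tag are placeholders:

docker run --rm --entrypoint /bin/promtool \
  -v "$(pwd)/alerts.yml:/alerts.yml" \
  prom/prometheus:v2.28.1 check rules /alerts.yml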