use firefly for alerts - ansible-taler-exchange - Ansible playbook to deploy a production Taler Exchange

commit 613cd9be4a8a28539f754a1f83a13f204e26cfac
parent 65cd6b80032190843960fa3464c399e749b7e148
Author: Christian Grothoff <christian@grothoff.org>
Date:   Mon,  2 Jun 2025 09:14:36 +0200

use firefly for alerts

Diffstat:
A roles/monitoring/files/etc/prometheus/alert_rules.yml  | 29 +++++++++++++++++++++++++++++
M roles/monitoring/files/etc/prometheus/prometheus.yml  | 8 ++++----
M roles/monitoring/tasks/main.yml  | 8 ++++++++
M roles/monitoring/templates/etc/prometheus/alertmanager.yml  | 12 ++++++------

4 files changed, 47 insertions(+), 10 deletions(-)
diff --git a/roles/monitoring/files/etc/prometheus/alert_rules.yml b/roles/monitoring/files/etc/prometheus/alert_rules.yml
@@ -0,0 +1,29 @@
+groups:
+- name: node_exporter_alerts  # host-level alert rules computed from node_exporter metrics
+  rules:
+  - alert: HighCPULatency  # 5m rate window: with 60s scrapes a 1m window rarely holds the 2 samples rate() needs
+    expr: sum(rate(node_cpu_seconds_total{mode="system"}[5m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 80
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "High CPU Latency detected"
+      description: "System-mode CPU usage (5m average) has been above 80% for more than 1 minute."
+
+  - alert: LowDiskSpace  # evaluated per filesystem series: free bytes as a percentage of size
+    expr: (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 < 10
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Low Disk Space detected"
+      description: "Disk space is below 10% for more than 1 minute."
+
+  - alert: HighMemoryUsage  # used share is computed as 1 - MemAvailable / MemTotal
+    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "High Memory Usage detected"
+      description: "Memory usage is above 80% for more than 1 minute."
diff --git a/roles/monitoring/files/etc/prometheus/prometheus.yml b/roles/monitoring/files/etc/prometheus/prometheus.yml
@@ -1,7 +1,7 @@
 # my global config
 global:
-  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
-  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
+  scrape_interval: 60s # Scrape targets every 60 seconds (equal to the 1 minute default).
+  evaluation_interval: 60s # Evaluate rules every 60 seconds (equal to the 1 minute default).
   # scrape_timeout is set to the global default (10s).
 
 # Alertmanager configuration -- FIXME: not yet setup!
@@ -31,13 +31,13 @@ scrape_configs:
 
   # Job, for local node exporter
   - job_name: 'node_exporter_metrics'
-    scrape_interval: 5s
+    scrape_interval: 60s
     static_configs:
       - targets: ['localhost:9100']
 
   # Job, for local nginx exporter
   - job_name: 'nginx_exporter_metrics'
-    scrape_interval: 5s
+    scrape_interval: 60s
     static_configs:
       - targets: ['localhost:9113']
 
diff --git a/roles/monitoring/tasks/main.yml b/roles/monitoring/tasks/main.yml
@@ -200,6 +200,14 @@
     group: root
     mode: "0644"
 
+- name: Configure node-exporter rules for alertmanager
+  copy:  # installs the static rule file shipped under the role's files/ directory
+    dest: /etc/prometheus/alert_rules.yml
+    src: etc/prometheus/alert_rules.yml
+    mode: "0644"
+    owner: root
+    group: root
+
 - name: Ensure exporter services are enabled and started
   service:
     name: "{{ item }}"
diff --git a/roles/monitoring/templates/etc/prometheus/alertmanager.yml b/roles/monitoring/templates/etc/prometheus/alertmanager.yml
@@ -3,14 +3,14 @@
 
 global:
   # The smarthost and SMTP sender used for mail notifications.
-  smtp_smarthost: 'localhost:25'
+  smtp_smarthost: 'firefly.gnunet.org:25'  # Alertmanager expects host:port; a bare hostname is not valid here
   smtp_from: 'alertmanager@taler.net'
   smtp_require_tls: false
     #smtp_auth_username: 'alertmanager'
     #smtp_auth_password: 'password'
 
 # The directory from which notification templates are read.
-templates: 
+templates:
 - '/etc/prometheus/alertmanager_templates/*.tmpl'
 
 # The root route on which each incoming alert enters.
@@ -23,7 +23,7 @@ route:
   # When a new group of alerts is created by an incoming alert, wait at
   # least 'group_wait' to send the initial notification.
   # This way ensures that you get multiple alerts for the same group that start
-  # firing shortly after another are batched together on the first 
+  # firing shortly after another are batched together on the first
   # notification.
   group_wait: 30s
 
@@ -33,12 +33,12 @@ route:
 
   # If an alert has successfully been sent, wait 'repeat_interval' to
   # resend them.
-  repeat_interval: 12h 
+  repeat_interval: 12h
 
   # A default receiver
   receiver: taler-warning-mails
 
-  # All the above attributes are inherited by all child routes and can 
+  # All the above attributes are inherited by all child routes and can be
   # overwritten on each.
 
   # The child route trees.
@@ -50,7 +50,7 @@ route:
 
 # Inhibition rules allow to mute a set of alerts given that another alert is
 # firing.
-# We use this to mute any warning-level notifications if the same alert is 
+# We use this to mute any warning-level notifications if the same alert is
 # already critical.
 inhibit_rules:
 - source_match:

	ansible-taler-exchange Ansible playbook to deploy a production Taler Exchange
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

A	roles/monitoring/files/etc/prometheus/alert_rules.yml	\|	29	+++++++++++++++++++++++++++++
M	roles/monitoring/files/etc/prometheus/prometheus.yml	\|	8	++++----
M	roles/monitoring/tasks/main.yml	\|	8	++++++++
M	roles/monitoring/templates/etc/prometheus/alertmanager.yml	\|	12	++++++------