ansible-taler-exchange

Ansible playbook to deploy a production Taler Exchange
Log | Files | Refs | Submodules | README | LICENSE

commit 613cd9be4a8a28539f754a1f83a13f204e26cfac
parent 65cd6b80032190843960fa3464c399e749b7e148
Author: Christian Grothoff <christian@grothoff.org>
Date:   Mon,  2 Jun 2025 09:14:36 +0200

use firefly for alerts

Diffstat:
A roles/monitoring/files/etc/prometheus/alert_rules.yml | 29 +++++++++++++++++++++++++++++
M roles/monitoring/files/etc/prometheus/prometheus.yml | 8 ++++----
M roles/monitoring/tasks/main.yml | 8 ++++++++
M roles/monitoring/templates/etc/prometheus/alertmanager.yml | 12 ++++++------
4 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/roles/monitoring/files/etc/prometheus/alert_rules.yml b/roles/monitoring/files/etc/prometheus/alert_rules.yml @@ -0,0 +1,29 @@ +groups: +- name: node_exporter_alerts + rules: + - alert: HighCPULatency + expr: sum(rate(node_cpu_seconds_total{mode="system"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 80 + for: 1m + labels: + severity: warning + annotations: + summary: "High CPU Latency detected" + description: "CPU latency is above 80% for more than 1 minute." + + - alert: LowDiskSpace + expr: (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 < 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Low Disk Space detected" + description: "Disk space is below 10% for more than 1 minute." + + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80 + for: 1m + labels: + severity: warning + annotations: + summary: "High Memory Usage detected" + description: "Memory usage is above 80% for more than 1 minute." diff --git a/roles/monitoring/files/etc/prometheus/prometheus.yml b/roles/monitoring/files/etc/prometheus/prometheus.yml @@ -1,7 +1,7 @@ # my global config global: - scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. - evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + scrape_interval: 60s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 60s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). # Alertmanager configuration -- FIXME: not yet setup! 
@@ -31,13 +31,13 @@ scrape_configs: # Job, for local node exporter - job_name: 'node_exporter_metrics' - scrape_interval: 5s + scrape_interval: 60s static_configs: - targets: ['localhost:9100'] # Job, for local nginx exporter - job_name: 'nginx_exporter_metrics' - scrape_interval: 5s + scrape_interval: 60s static_configs: - targets: ['localhost:9113'] diff --git a/roles/monitoring/tasks/main.yml b/roles/monitoring/tasks/main.yml @@ -200,6 +200,14 @@ group: root mode: "0644" +- name: Configure node-exporter rules for alertmanager + copy: + src: etc/prometheus/alert_rules.yml + dest: /etc/prometheus/alert_rules.yml + owner: root + group: root + mode: "0644" + - name: Ensure exporter services are enabled and started service: name: "{{ item }}" diff --git a/roles/monitoring/templates/etc/prometheus/alertmanager.yml b/roles/monitoring/templates/etc/prometheus/alertmanager.yml @@ -3,14 +3,14 @@ global: # The smarthost and SMTP sender used for mail notifications. - smtp_smarthost: 'localhost:25' + smtp_smarthost: 'firefly.gnunet.org' smtp_from: 'alertmanager@taler.net' smtp_require_tls: false #smtp_auth_username: 'alertmanager' #smtp_auth_password: 'password' # The directory from which notification templates are read. -templates: +templates: - '/etc/prometheus/alertmanager_templates/*.tmpl' # The root route on which each incoming alert enters. @@ -23,7 +23,7 @@ route: # When a new group of alerts is created by an incoming alert, wait at # least 'group_wait' to send the initial notification. # This way ensures that you get multiple alerts for the same group that start - # firing shortly after another are batched together on the first + # firing shortly after another are batched together on the first # notification. group_wait: 30s @@ -33,12 +33,12 @@ route: # If an alert has successfully been sent, wait 'repeat_interval' to # resend them. 
- repeat_interval: 12h + repeat_interval: 12h # A default receiver receiver: taler-warning-mails - # All the above attributes are inherited by all child routes and can + # All the above attributes are inherited by all child routes and can # overwritten on each. # The child route trees. @@ -50,7 +50,7 @@ route: # Inhibition rules allow to mute a set of alerts given that another alert is # firing. -# We use this to mute any warning-level notifications if the same alert is +# We use this to mute any warning-level notifications if the same alert is # already critical. inhibit_rules: - source_match: