commit e1a42593a08bbe6e8e3dd7e491064fd7cd48fef7 parent 9301a4c24f71a507469c364434fc52b3c910081a Author: Christian Grothoff <christian@grothoff.org> Date: Tue, 3 Jun 2025 01:07:41 +0200 tighten alert rules Diffstat:
3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/roles/monitoring/files/etc/prometheus/alert_rules.yml b/roles/monitoring/files/etc/prometheus/alert_rules.yml @@ -11,13 +11,13 @@ groups: description: "CPU latency is above 80% for more than 1 minute." - alert: LowDiskSpace - expr: (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 < 10 + expr: (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 < 50 for: 1m labels: severity: critical annotations: summary: "Low Disk Space detected" - description: "Disk space is below 10% for more than 1 minute." + description: "Disk space is below 50% for more than 1 minute." - alert: HighMemoryUsage expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80 diff --git a/roles/monitoring/files/etc/prometheus/node-exporter-rules.yml b/roles/monitoring/files/etc/prometheus/node-exporter-rules.yml @@ -5,13 +5,13 @@ groups: rules: - alert: HostOutOfMemory - expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)' + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .20)' for: 2m labels: severity: warning annotations: summary: Host out of memory (instance {{ $labels.instance }}) - description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Node memory is filling up (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostMemoryUnderMemoryPressure expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)' @@ -59,13 +59,13 @@ groups: description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOutOfDiskSpace - expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' + expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .50 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' for: 2m labels: severity: critical annotations: summary: Host out of disk space (instance {{ $labels.instance }}) - description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk is almost full (< 50% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostDiskMayFillIn24Hours expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0' @@ -77,13 +77,13 @@ groups: description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOutOfInodes - expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)' + expr: '(node_filesystem_files_free / node_filesystem_files < .50 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)' for: 2m labels: severity: critical annotations: summary: Host out of inodes (instance {{ $labels.instance }}) - description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk is almost running out of available inodes (< 50% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostFilesystemDeviceError expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1' diff --git a/roles/monitoring/templates/etc/prometheus/alertmanager.yml b/roles/monitoring/templates/etc/prometheus/alertmanager.yml @@ -3,7 +3,7 @@ global: # The smarthost and SMTP sender used for mail notifications. - smtp_smarthost: 'firefly.gnunet.org' + smtp_smarthost: 'firefly.gnunet.org:25' smtp_from: 'alertmanager@taler.net' smtp_require_tls: false #smtp_auth_username: 'alertmanager'