node-exporter-rules.yml - ansible-taler-exchange - Ansible playbook to deploy a production Taler Exchange

node-exporter-rules.yml (14664B)
      1 groups:
      2 
      3 - name: NodeExporter
      4 
      5   rules:
      6 
      7     - alert: HostOutOfMemory  # Available memory has dropped below 20% of total memory.
      8       expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .20)'
      9       for: 2m  # Condition must hold for 2 minutes before the alert fires.
     10       labels:
     11         severity: warning
     12       annotations:
     13         summary: Host out of memory (instance {{ $labels.instance }})
     14         description: "Node memory is filling up (< 20% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     15 
     16     - alert: HostMemoryUnderMemoryPressure  # Major page-fault rate (5m window) exceeds 1000 per second.
     17       expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
     18       for: 0m  # No pending period: fires as soon as the expression matches.
     19       labels:
     20         severity: warning
     21       annotations:
     22         summary: Host memory under memory pressure (instance {{ $labels.instance }})
     23         description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     24 
     25     - alert: HostUnusualNetworkThroughputIn  # Receive byte rate exceeds 80% of the interface speed.
     26       expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'  # Rate and speed series are matched on instance and device only.
     27       for: 0m  # No pending period: fires as soon as the expression matches.
     28       labels:
     29         severity: warning
     30       annotations:
     31         summary: Host unusual network throughput in (instance {{ $labels.instance }})
     32         description: "Host receive bandwidth is high (>80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     33 
     34     - alert: HostUnusualNetworkThroughputOut  # Transmit byte rate exceeds 80% of the interface speed.
     35       expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'  # Rate and speed series are matched on instance and device only.
     36       for: 0m  # No pending period: fires as soon as the expression matches.
     37       labels:
     38         severity: warning
     39       annotations:
     40         summary: Host unusual network throughput out (instance {{ $labels.instance }})
     41         description: "Host transmit bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     42 
     43     - alert: HostUnusualDiskReadRate  # Per-second growth of the disk I/O time counter (5m window) exceeds 0.80.
     44       expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'  # NOTE(review): same expression as the HostUnusualDiskIo rule in this file; it measures I/O busy time, not read throughput - confirm this rule is still wanted under this name.
     45       for: 0m  # No pending period: fires as soon as the expression matches.
     46       labels:
     47         severity: warning
     48       annotations:
     49         summary: Host unusual disk read rate (instance {{ $labels.instance }})
     50         description: "Disk is too busy (IO wait > 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     51 
     52     - alert: HostOutOfDiskSpace  # Less than 50% of a filesystem's size is still available.
     53       expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .50 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'  # Filesystem types matched by the regex are skipped; the "and" clause keeps only filesystems that are not read-only.
     54       for: 2m  # Condition must hold for 2 minutes before the alert fires.
     55       labels:
     56         severity: critical
     57       annotations:
     58         summary: Host out of disk space (instance {{ $labels.instance }})
     59         description: "Disk is almost full (< 50% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     60 
     61     - alert: HostDiskMayFillIn24Hours  # Linear trend of available bytes reaches zero within 24 hours.
     62       expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'  # Extrapolates the last hour 86400 s ahead; the "and" clause skips filesystems that are already full.
     63       for: 2m  # Condition must hold for 2 minutes before the alert fires.
     64       labels:
     65         severity: warning
     66       annotations:
     67         summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
     68         description: "Filesystem will likely run out of space within the next 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     69 
     70     - alert: HostOutOfInodes  # Fewer than 50% of a filesystem's inodes are free.
     71       expr: '(node_filesystem_files_free / node_filesystem_files < .50 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'  # The "and" clause keeps only filesystems that are not read-only.
     72       for: 2m  # Condition must hold for 2 minutes before the alert fires.
     73       labels:
     74         severity: critical
     75       annotations:
     76         summary: Host out of inodes (instance {{ $labels.instance }})
     77         description: "Disk is almost running out of available inodes (< 50% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     78 
     79     - alert: HostFilesystemDeviceError  # node_filesystem_device_error reports 1 for a filesystem.
     80       expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'  # Filesystem types matched by the regex are skipped.
     81       for: 2m  # Condition must hold for 2 minutes before the alert fires.
     82       labels:
     83         severity: critical
     84       annotations:
     85         summary: Host filesystem device error (instance {{ $labels.instance }})
     86         description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     87 
     88     - alert: HostInodesMayFillIn24Hours  # Linear trend of free inodes reaches zero within 24 hours.
     89       expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'  # Extrapolates the last hour 86400 s ahead; the "and" clause skips filesystems that already have no free inodes.
     90       for: 2m  # Condition must hold for 2 minutes before the alert fires.
     91       labels:
     92         severity: warning
     93       annotations:
     94         summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
     95         description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
     96 
     97     - alert: HostUnusualDiskReadLatency  # Average time per completed read (1m window) exceeds 0.1 s.
     98       expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'  # The "and" clause requires that reads actually completed in the window.
     99       for: 2m  # Condition must hold for 2 minutes before the alert fires.
    100       labels:
    101         severity: warning
    102       annotations:
    103         summary: Host unusual disk read latency (instance {{ $labels.instance }})
    104         description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    105 
    106     - alert: HostUnusualDiskWriteLatency  # Average time per completed write (1m window) exceeds 0.1 s.
    107       expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'  # The "and" clause requires that writes actually completed in the window.
    108       for: 2m  # Condition must hold for 2 minutes before the alert fires.
    109       labels:
    110         severity: warning
    111       annotations:
    112         summary: Host unusual disk write latency (instance {{ $labels.instance }})
    113         description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    114 
    115     - alert: HostHighCpuLoad  # Non-idle CPU fraction, averaged over all CPUs of an instance, is above 80%.
    116       expr: '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > .80'  # Computed as 1 minus the idle fraction: averaging the individual non-idle mode series would divide the busy fraction by the number of modes, so the 0.80 threshold could never be reached.
    117       for: 10m  # Condition must hold for 10 minutes before the alert fires.
    118       labels:
    119         severity: warning
    120       annotations:
    121         summary: Host high CPU load (instance {{ $labels.instance }})
    122         description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    123 
    124     - alert: HostCpuStealNoisyNeighbor  # Steal-mode CPU time, averaged per instance over 5m, exceeds 10%.
    125       expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'  # The rate is scaled by 100, so the threshold is expressed in percent.
    126       for: 0m  # No pending period: fires as soon as the expression matches.
    127       labels:
    128         severity: warning
    129       annotations:
    130         summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
    131         description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    132 
    133     - alert: HostCpuHighIowait  # iowait-mode CPU time, averaged per instance over 5m, exceeds 0.10.
    134       expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
    135       for: 0m  # No pending period: fires as soon as the expression matches.
    136       labels:
    137         severity: warning
    138       annotations:
    139         summary: Host CPU high iowait (instance {{ $labels.instance }})
    140         description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    141 
    142     - alert: HostUnusualDiskIo  # Per-second growth of the disk I/O time counter (5m window) stays above 0.8.
    143       expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
    144       for: 5m  # Condition must hold for 5 minutes before the alert fires.
    145       labels:
    146         severity: warning
    147       annotations:
    148         summary: Host unusual disk IO (instance {{ $labels.instance }})
    149         description: "Disk I/O utilization > 80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    150 
    151     - alert: HostContextSwitchingHigh  # 15m context-switch rate is more than twice the 1d rate.
    152       expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'  # Both rates are divided by the number of idle-mode CPU series before being compared.
    153       for: 0m  # No pending period: fires as soon as the expression matches.
    154       labels:
    155         severity: warning
    156       annotations:
    157         summary: Host context switching high (instance {{ $labels.instance }})
    158         description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    159 
    160     - alert: HostSwapIsFillingUp  # Used swap exceeds 80% of total swap.
    161       expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'  # Used share is derived as 1 minus the free share, scaled to percent.
    162       for: 2m  # Condition must hold for 2 minutes before the alert fires.
    163       labels:
    164         severity: warning
    165       annotations:
    166         summary: Host swap is filling up (instance {{ $labels.instance }})
    167         description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    168 
    169     - alert: HostSystemdServiceCrashed  # A systemd unit reports state "failed".
    170       expr: '(node_systemd_unit_state{state="failed"} == 1)'
    171       for: 0m  # No pending period: fires as soon as the expression matches.
    172       labels:
    173         severity: warning
    174       annotations:
    175         summary: Host systemd service crashed (instance {{ $labels.instance }})
    176         description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    177 
    178     - alert: HostPhysicalComponentTooHot  # A hwmon temperature reading exceeds the matching hwmon maximum value.
    179       expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
    180       for: 5m  # Condition must hold for 5 minutes before the alert fires.
    181       labels:
    182         severity: warning
    183       annotations:
    184         summary: Host physical component too hot (instance {{ $labels.instance }})
    185         description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    186 
    187     - alert: HostNodeOvertemperatureAlarm  # Either hwmon temperature alarm metric reports 1.
    188       expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
    189       for: 0m  # No pending period: fires as soon as the expression matches.
    190       labels:
    191         severity: critical
    192       annotations:
    193         summary: Host node overtemperature alarm (instance {{ $labels.instance }})
    194         description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    195 
    196     - alert: HostSoftwareRaidInsufficientDrives  # An MD array has fewer active disks than it requires.
    197       expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'  # Required and active counts are matched on device and instance only.
    198       for: 0m  # No pending period: fires as soon as the expression matches.
    199       labels:
    200         severity: critical
    201       annotations:
    202         summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
    203         description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    204 
    205     - alert: HostSoftwareRaidDiskFailure  # An MD array reports at least one disk in state "failed".
    206       expr: '(node_md_disks{state="failed"} > 0)'
    207       for: 2m  # Condition must hold for 2 minutes before the alert fires.
    208       labels:
    209         severity: warning
    210       annotations:
    211         summary: Host software RAID disk failure (instance {{ $labels.instance }})
    212         description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    213 
    214     - alert: HostKernelVersionDeviations  # An instance reported more than one kernel release/version within the last hour.
    215       expr: 'count by (instance) (count by (instance, release, version) (count_over_time(node_uname_info[1h]))) > 1'  # node_uname_info is a constant-valued info metric, so a kernel change shows up as a new label set rather than a value change that changes() could see; count the distinct release/version label sets instead.
    216       for: 0m  # No pending period: fires as soon as the expression matches.
    217       labels:
    218         severity: info
    219       annotations:
    220         summary: Host kernel version deviations (instance {{ $labels.instance }})
    221         description: "Kernel version for {{ $labels.instance }} has changed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    222 
    223     - alert: HostOomKillDetected  # The OOM-kill counter increased within the last minute.
    224       expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
    225       for: 0m  # No pending period: fires as soon as the expression matches.
    226       labels:
    227         severity: warning
    228       annotations:
    229         summary: Host OOM kill detected (instance {{ $labels.instance }})
    230         description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    231 
    232     - alert: HostEdacCorrectableErrorsDetected  # The EDAC correctable-error counter increased within the last minute.
    233       expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'  # $value is the increase over the 1m window, which the description reports.
    234       for: 0m  # No pending period: fires as soon as the expression matches.
    235       labels:
    236         severity: info
    237       annotations:
    238         summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    239         description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    240 
    241     - alert: HostEdacUncorrectableErrorsDetected  # The EDAC uncorrectable-error counter is non-zero.
    242       expr: '(node_edac_uncorrectable_errors_total > 0)'  # Tests the cumulative counter itself, so $value is the running total rather than a per-window count.
    243       for: 0m  # No pending period: fires as soon as the expression matches.
    244       labels:
    245         severity: warning
    246       annotations:
    247         summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
    248         description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in total (cumulative counter value).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    249 
    250     - alert: HostNetworkReceiveErrors  # More than 1% of received packets had errors (2m rates).
    251       expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'  # $value is an error ratio, not an error count, so the description formats it as a percentage.
    252       for: 2m  # Condition must hold for 2 minutes before the alert fires.
    253       labels:
    254         severity: warning
    255       annotations:
    256         summary: Host Network Receive Errors (instance {{ $labels.instance }})
    257         description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    258 
    259     - alert: HostNetworkTransmitErrors  # More than 1% of transmitted packets had errors (2m rates).
    260       expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'  # $value is an error ratio, not an error count, so the description formats it as a percentage.
    261       for: 2m  # Condition must hold for 2 minutes before the alert fires.
    262       labels:
    263         severity: warning
    264       annotations:
    265         summary: Host Network Transmit Errors (instance {{ $labels.instance }})
    266         description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    267 
    268     - alert: HostNetworkBondDegraded  # Active member count of a bond differs from its configured member count.
    269       expr: '((node_bonding_active - node_bonding_slaves) != 0)'
    270       for: 2m  # Condition must hold for 2 minutes before the alert fires.
    271       labels:
    272         severity: warning
    273       annotations:
    274         summary: Host Network Bond Degraded (instance {{ $labels.instance }})
    275         description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    276 
    277     - alert: HostConntrackLimit  # Conntrack entries exceed 80% of the conntrack limit.
    278       expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
    279       for: 5m  # Condition must hold for 5 minutes before the alert fires.
    280       labels:
    281         severity: warning
    282       annotations:
    283         summary: Host conntrack limit (instance {{ $labels.instance }})
    284         description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    285 
    286     - alert: HostClockSkew  # Clock offset is beyond +/-0.05 s and is not shrinking toward zero.
    287       expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'  # Each branch pairs an offset sign with a 5m derivative of the same sign (or zero), so an offset that is being corrected does not match.
    288       for: 10m  # Condition must hold for 10 minutes before the alert fires.
    289       labels:
    290         severity: warning
    291       annotations:
    292         summary: Host clock skew (instance {{ $labels.instance }})
    293         description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    294 
    295     - alert: HostClockNotSynchronising  # Sync status was 0 at some point in the last minute and the maximum error is at least 16 s.
    296       expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
    297       for: 2m  # Condition must hold for 2 minutes before the alert fires.
    298       labels:
    299         severity: warning
    300       annotations:
    301         summary: Host clock not synchronising (instance {{ $labels.instance }})
    302         description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    303 
    304     - alert: HostRequiresReboot  # node_reboot_required is above 0.
    305       expr: '(node_reboot_required > 0)'
    306       for: 4h  # Condition must hold for 4 hours before the alert fires.
    307       labels:
    308         severity: info
    309       annotations:
    310         summary: Host requires reboot (instance {{ $labels.instance }})
    311         description: "{{ $labels.instance }} requires a reboot.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
	ansible-taler-exchange Ansible playbook to deploy a production Taler Exchange
	Log \| Files \| Refs \| Submodules \| README \| LICENSE