node-exporter-rules.yml (14664B)
# Prometheus alerting rules for hosts scraped through node_exporter.
#
# Conventions used by every rule in this group:
#   - `expr` is single-quoted so PromQL label matchers can use double quotes.
#   - `description` is double-quoted because it relies on the `\n` escape.
#   - Ratio thresholds are fractions (.80 == 80%) unless the expression
#     multiplies by 100 explicitly.
groups:
  - name: NodeExporter
    rules:
      # ------------------------------------------------------------------
      # Memory
      # ------------------------------------------------------------------
      - alert: HostOutOfMemory
        expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .20)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of memory (instance {{ $labels.instance }})'
          description: "Node memory is filling up (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryUnderMemoryPressure
        expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host memory under memory pressure (instance {{ $labels.instance }})'
          description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ------------------------------------------------------------------
      # Network throughput
      # ------------------------------------------------------------------
      - alert: HostUnusualNetworkThroughputIn
        expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host unusual network throughput in (instance {{ $labels.instance }})'
          description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputOut
        expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host unusual network throughput out (instance {{ $labels.instance }})'
          description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ------------------------------------------------------------------
      # Disk and filesystem
      # ------------------------------------------------------------------
      # NOTE(review): this expression is identical to HostUnusualDiskIo below;
      # the two rules differ only in `for:` (0m here, 5m there), so both fire
      # for the same condition. Confirm whether one of them is redundant.
      - alert: HostUnusualDiskReadRate
        expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host unusual disk read rate (instance {{ $labels.instance }})'
          description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Read-only filesystems are excluded via the `and on (...)` clause.
      - alert: HostOutOfDiskSpace
        expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .50 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Host out of disk space (instance {{ $labels.instance }})'
          description: "Disk is almost full (< 50% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Linear extrapolation of the last hour, 86400 s (24 h) ahead; the
      # `and ... > 0` clause skips filesystems that are already full.
      - alert: HostDiskMayFillIn24Hours
        expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host disk may fill in 24 hours (instance {{ $labels.instance }})'
          description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfInodes
        expr: '(node_filesystem_files_free / node_filesystem_files < .50 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Host out of inodes (instance {{ $labels.instance }})'
          description: "Disk is almost running out of available inodes (< 50% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostFilesystemDeviceError
        expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Host filesystem device error (instance {{ $labels.instance }})'
          description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostInodesMayFillIn24Hours
        expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host inodes may fill in 24 hours (instance {{ $labels.instance }})'
          description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Average seconds per completed operation; the `and rate(...) > 0`
      # clause drops devices with no completed operations in the window.
      - alert: HostUnusualDiskReadLatency
        expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host unusual disk read latency (instance {{ $labels.instance }})'
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskWriteLatency
        expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host unusual disk write latency (instance {{ $labels.instance }})'
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ------------------------------------------------------------------
      # CPU
      # ------------------------------------------------------------------
      # Busy fraction is computed as 1 - idle. Averaging the non-idle modes
      # directly (avg over every cpu/mode series) divides the busy time by the
      # number of modes, which caps the result far below the .80 threshold.
      - alert: HostHighCpuLoad
        expr: '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > .80'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Host high CPU load (instance {{ $labels.instance }})'
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostCpuStealNoisyNeighbor
        expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host CPU steal noisy neighbor (instance {{ $labels.instance }})'
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostCpuHighIowait
        expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host CPU high iowait (instance {{ $labels.instance }})'
          description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskIo
        expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Host unusual disk IO (instance {{ $labels.instance }})'
          description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Ratio of the 15m per-CPU context-switch rate to the 1d per-CPU rate.
      - alert: HostContextSwitchingHigh
        expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host context switching high (instance {{ $labels.instance }})'
          description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostSwapIsFillingUp
        expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host swap is filling up (instance {{ $labels.instance }})'
          description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ------------------------------------------------------------------
      # Services and hardware
      # ------------------------------------------------------------------
      - alert: HostSystemdServiceCrashed
        expr: '(node_systemd_unit_state{state="failed"} == 1)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host systemd service crashed (instance {{ $labels.instance }})'
          description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostPhysicalComponentTooHot
        expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Host physical component too hot (instance {{ $labels.instance }})'
          description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNodeOvertemperatureAlarm
        expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Host node overtemperature alarm (instance {{ $labels.instance }})'
          description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostSoftwareRaidInsufficientDrives
        expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Host software RAID insufficient drives (instance {{ $labels.instance }})'
          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostSoftwareRaidDiskFailure
        expr: '(node_md_disks{state="failed"} > 0)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host software RAID disk failure (instance {{ $labels.instance }})'
          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # NOTE(review): changes() counts sample-value changes within one series.
      # node_uname_info looks like an info-style metric (kernel details in
      # labels, presumably a constant sample value); if so, a kernel change
      # creates a new series instead of changing a value and this rule never
      # fires. Confirm against live data before relying on it.
      - alert: HostKernelVersionDeviations
        expr: 'changes(node_uname_info[1h]) > 0'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: 'Host kernel version deviations (instance {{ $labels.instance }})'
          description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOomKillDetected
        expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host OOM kill detected (instance {{ $labels.instance }})'
          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The description window matches the 1m range used by increase().
      - alert: HostEdacCorrectableErrorsDetected
        expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: 'Host EDAC Correctable Errors detected (instance {{ $labels.instance }})'
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The expression compares the raw counter, so $value is the counter's
      # current total rather than a count over a fixed time window.
      - alert: HostEdacUncorrectableErrorsDetected
        expr: '(node_edac_uncorrectable_errors_total > 0)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})'
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC (current counter total).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ------------------------------------------------------------------
      # Network errors and connection tracking
      # ------------------------------------------------------------------
      # $value is errors divided by packets (a ratio), so it is rendered as a
      # percentage rather than as an error count.
      - alert: HostNetworkReceiveErrors
        expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host Network Receive Errors (instance {{ $labels.instance }})'
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkTransmitErrors
        expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host Network Transmit Errors (instance {{ $labels.instance }})'
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkBondDegraded
        expr: '((node_bonding_active - node_bonding_slaves) != 0)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host Network Bond Degraded (instance {{ $labels.instance }})'
          description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostConntrackLimit
        expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Host conntrack limit (instance {{ $labels.instance }})'
          description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ------------------------------------------------------------------
      # Time synchronisation and maintenance
      # ------------------------------------------------------------------
      # Fires only while the offset exceeds 50 ms and is not shrinking:
      # deriv() must have the same sign as the offset (or be zero).
      - alert: HostClockSkew
        expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Host clock skew (instance {{ $labels.instance }})'
          description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostClockNotSynchronising
        expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host clock not synchronising (instance {{ $labels.instance }})'
          description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostRequiresReboot
        expr: '(node_reboot_required > 0)'
        for: 4h
        labels:
          severity: info
        annotations:
          summary: 'Host requires reboot (instance {{ $labels.instance }})'
          description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"