fix up alloy deployment - ansible-taler-exchange - Ansible playbook to deploy a production Taler Exchange

commit 7d95abf0d91b57efd3b25fa1df97f9b9cb7a057a
parent 4334fc80d205bc20c17d71a986ff93858ef95568
Author: Christian Grothoff <christian@grothoff.org>
Date:   Thu, 30 Jan 2025 12:10:25 +0100

fix up alloy deployment

Diffstat:
M roles/monitoring/files/etc/default/prometheus  | 2 +-
M roles/monitoring/files/etc/prometheus/prometheus.yml  | 5 +++++
M roles/monitoring/templates/etc/alloy/config.alloy  | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------

3 files changed, 210 insertions(+), 77 deletions(-)
diff --git a/roles/monitoring/files/etc/default/prometheus b/roles/monitoring/files/etc/default/prometheus
@@ -2,4 +2,4 @@
 # Due to shell escaping, to pass backslashes for regexes, you need to double
 # them (\\d for \d). If running under systemd, you need to double them again
 # (\\\\d to mean \d), and escape newlines too.
-ARGS="--web.listen-address=127.0.0.1:9090"
+ARGS="--web.listen-address=127.0.0.1:9090 --enable-feature=remote-write-receiver"
diff --git a/roles/monitoring/files/etc/prometheus/prometheus.yml b/roles/monitoring/files/etc/prometheus/prometheus.yml
@@ -49,3 +49,8 @@ scrape_configs:
   - job_name: 'process_exporter'
     static_configs:
       - targets: ['localhost:9256']
+
+  # Job for scraping Alloy's own metrics endpoint
+  - job_name: 'alloy_exporter'
+    static_configs:
+      - targets: ['localhost:12345']
diff --git a/roles/monitoring/templates/etc/alloy/config.alloy b/roles/monitoring/templates/etc/alloy/config.alloy
@@ -1,44 +1,101 @@
-// Sample config for Alloy.
-//
 // For a full configuration reference, see https://grafana.com/docs/alloy
 logging {
   level = "warn"
 }
 
-// Which log files to monitor
+// Push the logs to loki
+// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/
+loki.write "grafana_loki" {
+    endpoint {
+        url = "https://loki.taler-systems.com/loki/api/v1/push"
+        authorization {
+          type = "Bearer"
+          credentials = "{{ LOKI_ACCESS_TOKEN }}"
+        }
+    }
+}
+
+
+// Which log files to monitor: all regular log files with errors
 local.file_match "local_files" {
    path_targets = [
         {"__path__" = "/var/log/*.log"},
-        {"__path__" = "/var/log/nginx/*.err"},
    ]
    sync_period = "5s"
 }
 
-// Which log files to monitor
-local.file_match "http_logs" {
+
+// Connect local_files as source to filter_generic_logs
+// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/
+loki.source.file "log_scrape" {
+    targets    = local.file_match.local_files.targets
+    forward_to = [loki.process.filter_generic_logs.receiver]
+    tail_from_end = true
+}
+
+// Which log files to monitor: nginx error logs
+local.file_match "nginx_errors" {
    path_targets = [
-        {"__path__" = "/var/log/nginx/*.log"},
+        {"__path__" = "/var/log/nginx/*.err"},
    ]
    sync_period = "5s"
 }
 
-// Connect local_files as source to filter_logs
+// Connect nginx_errors directly to loki
 // See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/
-loki.source.file "log_scrape" {
-    targets    = local.file_match.local_files.targets
-    forward_to = [loki.process.filter_logs.receiver]
+loki.source.file "nginx_error_scrape" {
+    targets    = local.file_match.nginx_errors.targets
+    forward_to = [loki.write.grafana_loki.receiver]
     tail_from_end = true
 }
 
-loki.source.file "web_scrape" {
+
+// Which log files to monitor: nginx regular logs
+local.file_match "http_logs" {
+   path_targets = [
+        {"__path__" = "/var/log/nginx/*.log"},
+   ]
+   sync_period = "5s"
+}
+
+// Connect http_logs as source to filter_http
+// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/
+loki.source.file "http_scrape" {
     targets    = local.file_match.http_logs.targets
-    forward_to = [loki.process.filter_logs.receiver]
+    forward_to = [loki.process.filter_http.receiver]
     tail_from_end = true
 }
 
-// Filter the logs
+// Filter the HTTP logs
 // See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/
-loki.process "filter_logs" {
+loki.process "filter_http" {
+
+     // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stageregex-block
+    stage.regex {
+      expression  = "(?P<ip>\\S+) (?P<identd>\\S+) (?P<user>\\S+) \\[(?P<timestamp>[\\w:\\/]+\\s[+\\\\-]\\d{4})\\] \"(?P<action>\\S+)\\s?(?P<path>\\S+)\\s?(?P<protocol>\\S+)?\" (?P<status>\\d{3}|-) (?P<size>\\d+|-)\\s?\"?(?P<referrer>[^\\\"]*)\"?\\s?\"?(?P<useragent>[^\\\"]*)?\"?"
+    }
+
+    // exported via http://localhost:12345/metrics to Prometheus
+    stage.metrics {
+      metric.histogram {
+        name = "http_status_codes"
+        prefix = "taler_requests_"
+        description = "HTTP status codes, reported from Nginx (all requests)"
+        source = "status"
+        max_idle_duration = "24h"
+        buckets = [100,199,200,201,202,203,299,300,399,400,401,402,403,404,405,406,407,408,409,410,411,418,419,420,450,451,452,499,500,599]
+      }
+
+      // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metriccounter-block
+      metric.counter {
+        name = "total_requests"
+        prefix = "taler_requests_"
+        description = "Total Requests"
+        match_all = true
+        action = "inc"
+      }
+    }
+
     stage.drop {
         source = "http_logs"
         expression  = ".*GET.* 200 .*"
@@ -47,64 +104,40 @@ loki.process "filter_logs" {
     forward_to = [loki.write.grafana_loki.receiver]
 }
 
-// Push the logs to loki
-// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/
-loki.write "grafana_loki" {
-    endpoint {
-        url = "https://loki.taler-systems.com/loki/api/v1/push"
-        authorization {
-          type = "Bearer"
-          credentials = "{{ LOKI_ACCESS_TOKEN }}"
-        }
-    }
-}
-
-// This was in the defaults, FIXME: not sure what it does...
-prometheus.exporter.unix "default" {
-  include_exporter_metrics = true
-  disable_collectors       = ["mdadm"]
-}
 
-// This was in the defaults, FIXME: not sure what it does...
-prometheus.scrape "default" {
-  targets = array.concat(
-    prometheus.exporter.unix.default.targets,
-    [{
-      // Self-collect metrics
-      job         = "alloy",
-      __address__ = "127.0.0.1:12345",
-    }],
-  )
-
-  forward_to = [
-  // TODO: components to forward metrics to (like prometheus.remote_write or
-  // prometheus.relabel).
-  ]
-}
-
-
-loki.source.file "nginx_taler_performance_logs" {
-  targets    = [{
+// Monitor the logs with the latency statistics
+local.file_match "nginx_taler_performance_logs" {
+  path_targets    = [{
     __path__  = "/var/log/nginx/*.tal",
     job       = "nginx/performance",
   }]
-  forward_to    = [loki.process.perf_logs.receiver]
+  sync_period = "5s"
 }
 
 
-# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/
+// Connect nginx_taler_performance_logs as source to perf_logs
+// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/
+loki.source.file "perf_scrape" {
+   targets    = local.file_match.nginx_taler_performance_logs.targets
+   forward_to = [loki.process.perf_logs.receiver]
+   tail_from_end = true
+}
+
+
+
+// Here we export the *.tal logs with the Nginx latency data.
+// https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/
 loki.process "perf_logs" {
 
-# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stageregex-block
+  // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stageregex-block
   stage.regex {
     expression  = "uri=/(?P<ep>[a-zA-Z]+)(?:/\\w+)?(?:/(?P<act>[a-zA-Z-]+))? s=(?P<status>\\d{3}).*urt=(?P<urt>\\d+\\.\\d+|-) rt=(?P<response_time>\\d+\\.\\d+) rl=(?P<request_length>\\d+) bs=(?P<bytes_sent>\\d+)"
   }
 
-#
-# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stagetemplate-block
+  // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stagetemplate-block
   stage.template {
     source = "endpoint"
-    template = "{{ '{{' }} printf \"%s-%s\" .ep .act | trimSuffix "-" {{ '}}' }}"
+    template = "{{ '{{' }} printf \"%s-%s\" .ep .act | trimSuffix \"-\" {{ '}}' }}"
   }
 
   stage.template {
@@ -112,30 +145,23 @@ loki.process "perf_logs" {
     template = "{{ '{{' }} .urt | replace \"-\" \"0\" {{ '}}' }}"
   }
 
+    // exported via http://localhost:12345/metrics to Prometheus
   stage.metrics {
-# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metriccounter-block
-    metric.counter {
-      name = "total_requests"
-      prefix = "taler_requests_"
-      description = "Total Requests"
-      match_all = true
-      action = "inc"
-    }
+    // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metricgauge-block
     metric.gauge {
        name = "response_time"
        prefix = "taler_requests_"
-       description = "Time taken for Nginx to respond"
+       description = "Time taken for Nginx to respond (non-GET requests)"
        source = "response_time"
        max_idle_duration = "24h"
        action = "set"
     }
-
-# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metrichistogram-block
-# https://www.robustperception.io/how-does-a-prometheus-histogram-work
+    // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metrichistogram-block
+    // https://www.robustperception.io/how-does-a-prometheus-histogram-work
     metric.histogram {
       name = "request_length_hist"
       prefix = "taler_requests_"
-      description = "Request Length reported from Nginx"
+      description = "Request Length reported from Nginx (non-GET requests)"
       source = "request_length"
       max_idle_duration = "24h"
       buckets = [1,10,50,100,200,500,1000,2000,5000]
@@ -144,7 +170,7 @@ loki.process "perf_logs" {
     metric.histogram {
       name = "bytes_sent_hist"
       prefix = "taler_requests_"
-      description = "Number of bytes sent, reported from Nginx"
+      description = "Number of bytes sent, reported from Nginx (non-GET requests)"
       source = "bytes_sent"
       max_idle_duration = "24h"
       buckets = [1,10,50,100,200,500,1000,2000,5000]
@@ -152,7 +178,7 @@ loki.process "perf_logs" {
     metric.histogram {
       name = "response_time_hist"
       prefix = "taler_requests_"
-      description = "Time taken for Nginx to respond"
+      description = "Time taken for Nginx to respond (non-GET requests)"
       source = "response_time"
       max_idle_duration = "24h"
       buckets = [0.001,0.0025,0.005,0.010,0.025,0.050,0.1,0.25,0.5,1,2,5]
@@ -160,12 +186,115 @@ loki.process "perf_logs" {
     metric.histogram {
       name = "upstream_response_time_hist"
       prefix = "taler_requests_"
-      description = "Time taken for the Exchange to respond to Nginx"
+      description = "Time taken for the Exchange to respond to Nginx (non-GET requests)"
       source = "upstream_response_time"
       max_idle_duration = "24h"
       buckets = [0.001,0.0025,0.005,0.010,0.025,0.050,0.1,0.25,0.5,1,2,5]
     }
 
+  }
+  // Finally, pass on to Loki
   forward_to  = [loki.write.grafana_loki.receiver]
 
-}
-\ No newline at end of file
+}
+
+
+// Monitor journald logs.
+// Apply the relabel rules from loki.relabel.journal to the entries,
+// then forward them to our generic filter.
+loki.source.journal "read"  {
+  forward_to    = [loki.process.filter_generic_logs.receiver]
+  relabel_rules = loki.relabel.journal.rules
+  max_age       = "12h"
+  labels        = {component = "loki.source.journal"}
+}
+
+
+// https://community.grafana.com/t/scrape-journald-log-with-alloy-docker-container/119896
+loki.relabel "journal" {
+  forward_to = []
+  rule {
+    source_labels = ["__journal__systemd_unit"]
+    target_label  = "systemd_unit"
+  }
+  rule {
+    source_labels = ["__journal__hostname"]
+    target_label = "systemd_hostname"
+  }
+  rule {
+    source_labels = ["__journal__transport"]
+    target_label = "systemd_transport"
+  }
+}
+
+
+// Generic filter for logs
+// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/
+loki.process "filter_generic_logs" {
+    // Determine log level:
+    // https://community.grafana.com/t/extract-log-level-via-regex-and-set-it-as-a-label/134938/5
+    stage.regex {
+      expression = `(?P<level>(?i)\b(info|debug|error|warn|warning|trace|fatal)\b)`
+    }
+
+    // https://grafana.com/docs/alloy/latest/tutorials/processing-logs/
+    // Drop debug
+    stage.drop {
+        source = "level"
+        value = "debug"
+        drop_counter_reason = "boring debugging data"
+    }
+    // Drop info
+    stage.drop {
+        source = "level"
+        value = "info"
+        drop_counter_reason = "boring info logs"
+    }
+    // Drop trace
+    stage.drop {
+        source = "level"
+        value = "trace"
+        drop_counter_reason = "boring info logs"
+    }
+
+    stage.metrics {
+      // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metriccounter-block
+
+      // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metrichistogram-block
+      // https://www.robustperception.io/how-does-a-prometheus-histogram-work
+      metric.counter {
+        name = "warn_log_level"
+        prefix = "system_logs_"
+        description = "Warnings in system logs"
+        source = "level"
+        value = "warn"
+        action = "inc"
+      }
+      metric.counter {
+        name = "warn_log_level"
+        prefix = "system_logs_"
+        description = "Warnings in system logs"
+        source = "level"
+        value = "warning"
+        action = "inc"
+      }
+      metric.counter {
+        name = "error_log_level"
+        prefix = "system_logs_"
+        description = "Errors in system logs"
+        source = "level"
+        value = "error"
+        action = "inc"
+      }
+      metric.counter {
+        name = "fatal_log_level"
+        prefix = "system_logs_"
+        description = "Fatal reports in system logs"
+        source = "level"
+        value = "fatal"
+        action = "inc"
+      }
+    }
+
+    forward_to = [loki.write.grafana_loki.receiver]
+}

	ansible-taler-exchange Ansible playbook to deploy a production Taler Exchange
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	roles/monitoring/files/etc/default/prometheus	\|	2	+-
M	roles/monitoring/files/etc/prometheus/prometheus.yml	\|	5	+++++
M	roles/monitoring/templates/etc/alloy/config.alloy	\|	280	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------