ansible-taler-exchange

Ansible playbook to deploy a production Taler Exchange
Log | Files | Refs | Submodules | README | LICENSE

commit 7d95abf0d91b57efd3b25fa1df97f9b9cb7a057a
parent 4334fc80d205bc20c17d71a986ff93858ef95568
Author: Christian Grothoff <christian@grothoff.org>
Date:   Thu, 30 Jan 2025 12:10:25 +0100

fix up alloy deployment

Diffstat:
M roles/monitoring/files/etc/default/prometheus | 2 +-
M roles/monitoring/files/etc/prometheus/prometheus.yml | 5 +++++
M roles/monitoring/templates/etc/alloy/config.alloy | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
3 files changed, 210 insertions(+), 77 deletions(-)

diff --git a/roles/monitoring/files/etc/default/prometheus b/roles/monitoring/files/etc/default/prometheus @@ -2,4 +2,4 @@ # Due to shell escaping, to pass backslashes for regexes, you need to double # them (\\d for \d). If running under systemd, you need to double them again # (\\\\d to mean \d), and escape newlines too. -ARGS="--web.listen-address=127.0.0.1:9090" +ARGS="--web.listen-address=127.0.0.1:9090 --enable-feature=remote-write-receiver" diff --git a/roles/monitoring/files/etc/prometheus/prometheus.yml b/roles/monitoring/files/etc/prometheus/prometheus.yml @@ -49,3 +49,8 @@ scrape_configs: - job_name: 'process_exporter' static_configs: - targets: ['localhost:9256'] + + # Job, for Alloy + - job_name: 'alloy_exporter' + static_configs: + - targets: ['localhost:12345'] diff --git a/roles/monitoring/templates/etc/alloy/config.alloy b/roles/monitoring/templates/etc/alloy/config.alloy @@ -1,44 +1,101 @@ -// Sample config for Alloy. -// // For a full configuration reference, see https://grafana.com/docs/alloy logging { level = "warn" } -// Which log files to monitor +// Push the logs to loki +// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/ +loki.write "grafana_loki" { + endpoint { + url = "https://loki.taler-systems.com/loki/api/v1/push" + authorization { + type = "Bearer" + credentials = "{{ LOKI_ACCESS_TOKEN }}" + } + } +} + + +// Which log files to monitor: all regular log files with errors local.file_match "local_files" { path_targets = [ {"__path__" = "/var/log/*.log"}, - {"__path__" = "/var/log/nginx/*.err"}, ] sync_period = "5s" } -// Which log files to monitor -local.file_match "http_logs" { + +// Connect local_files as source to filter_generic_logs +// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/ +loki.source.file "log_scrape" { + targets = local.file_match.local_files.targets + forward_to = [loki.process.filter_generic_logs.receiver] + tail_from_end = true +} + +// Which log files to monitor: all 
regular log files with errors +local.file_match "nginx_errors" { path_targets = [ - {"__path__" = "/var/log/nginx/*.log"}, + {"__path__" = "/var/log/nginx/*.err"}, ] sync_period = "5s" } -// Connect local_files as source to filter_logs +// Connect nginx_errors directly to loki // See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/ -loki.source.file "log_scrape" { - targets = local.file_match.local_files.targets - forward_to = [loki.process.filter_logs.receiver] +loki.source.file "nginx_error_scrape" { + targets = local.file_match.nginx_errors.targets + forward_to = [loki.write.grafana_loki.receiver] tail_from_end = true } -loki.source.file "web_scrape" { + +// Which log files to monitor: nginx regular logs +local.file_match "http_logs" { + path_targets = [ + {"__path__" = "/var/log/nginx/*.log"}, + ] + sync_period = "5s" +} + +// Connect http_files as source to filter_http +// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/ +loki.source.file "http_scrape" { targets = local.file_match.http_logs.targets - forward_to = [loki.process.filter_logs.receiver] + forward_to = [loki.process.filter_http.receiver] tail_from_end = true } -// Filter the logs +// Filter the HTTP logs // See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/ -loki.process "filter_logs" { +loki.process "filter_http" { + + // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stageregex-block + stage.regex { + expression = "(?P<ip>\\S+) (?P<identd>\\S+) (?P<user>\\S+) \\[(?P<timestamp>[\\w:\\/]+\\s[+\\\\-]\\d{4})\\] \"(?P<action>\\S+)\\s?(?P<path>\\S+)\\s?(?P<protocol>\\S+)?\" (?P<status>\\d{3}|-) (?P<size>\\d+|-)\\s?\"?(?P<referrer>[^\\\"]*)\"?\\s?\"?(?P<useragent>[^\\\"]*)?\"?" 
+ } + + // exported via http://localhost:12345/metrics to Prometheus + stage.metrics { + metric.histogram { + name = "http_status_codes" + prefix = "taler_requests_" + description = "HTTP status codes, reported from Nginx (all requests)" + source = "status" + max_idle_duration = "24h" + buckets = [100,199,200,201,202,203,299,300,399,400,401,402,403,404,405,406,407,408,409,410,411,418,419,420,450,451,452,499,500,599] + } + + // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metriccounter-block + metric.counter { + name = "total_requests" + prefix = "taler_requests_" + description = "Total Requests" + match_all = true + action = "inc" + } + } + stage.drop { source = "http_logs" expression = ".*GET.* 200 .*" @@ -47,64 +104,40 @@ loki.process "filter_logs" { forward_to = [loki.write.grafana_loki.receiver] } -// Push the logs to loki -// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/ -loki.write "grafana_loki" { - endpoint { - url = "https://loki.taler-systems.com/loki/api/v1/push" - authorization { - type = "Bearer" - credentials = "{{ LOKI_ACCESS_TOKEN }}" - } - } -} - -// This was in the defaults, FIXME: not sure what it does... -prometheus.exporter.unix "default" { - include_exporter_metrics = true - disable_collectors = ["mdadm"] -} -// This was in the defaults, FIXME: not sure what it does... -prometheus.scrape "default" { - targets = array.concat( - prometheus.exporter.unix.default.targets, - [{ - // Self-collect metrics - job = "alloy", - __address__ = "127.0.0.1:12345", - }], - ) - - forward_to = [ - // TODO: components to forward metrics to (like prometheus.remote_write or - // prometheus.relabel). 
- ] -} - - -loki.source.file "nginx_taler_performance_logs" { - targets = [{ +// Monitor the logs with the latency statistics +local.file_match "nginx_taler_performance_logs" { + path_targets = [{ __path__ = "/var/log/nginx/*.tal", job = "nginx/performance", }] - forward_to = [loki.process.perf_logs.receiver] + sync_period = "5s" } -# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/ +// Connect nginx_taler_performance_logs as source to perf_logs +// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/ +loki.source.file "perf_scrape" { + targets = local.file_match.nginx_taler_performance_logs.targets + forward_to = [loki.process.perf_logs.receiver] + tail_from_end = true +} + + + +// Here we export the *.tal logs with the Nginx latency data. +// https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/ loki.process "perf_logs" { -# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stageregex-block + // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stageregex-block stage.regex { expression = "uri=/(?P<ep>[a-zA-Z]+)(?:/\\w+)?(?:/(?P<act>[a-zA-Z-]+))? 
s=(?P<status>\\d{3}).*urt=(?P<urt>\\d+\\.\\d+|-) rt=(?P<response_time>\\d+\\.\\d+) rl=(?P<request_length>\\d+) bs=(?P<bytes_sent>\\d+)" } -# -# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stagetemplate-block + // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#stagetemplate-block stage.template { source = "endpoint" - template = "{{ '{{' }} printf \"%s-%s\" .ep .act | trimSuffix "-" {{ '}}' }}" + template = "{{ '{{' }} printf \"%s-%s\" .ep .act | trimSuffix \"-\" {{ '}}' }}" } stage.template { @@ -112,30 +145,23 @@ loki.process "perf_logs" { template = "{{ '{{' }} .urt | replace \"-\" \"0\" {{ '}}' }}" } + // exported via http://localhost:12345/metrics to Prometheus stage.metrics { -# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metriccounter-block - metric.counter { - name = "total_requests" - prefix = "taler_requests_" - description = "Total Requests" - match_all = true - action = "inc" - } + // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metriccounter-block metric.gauge { name = "response_time" prefix = "taler_requests_" - description = "Time taken for Nginx to respond" + description = "Time taken for Nginx to respond (non-GET requests)" source = "response_time" max_idle_duration = "24h" action = "set" } - -# https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metrichistogram-block -# https://www.robustperception.io/how-does-a-prometheus-histogram-work + // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metrichistogram-block + // https://www.robustperception.io/how-does-a-prometheus-histogram-work metric.histogram { name = "request_length_hist" prefix = "taler_requests_" - description = "Request Length reported from Nginx" + description = "Request Length reported from Nginx (non-GET requests)" source = "request_length" max_idle_duration = "24h" buckets = 
[1,10,50,100,200,500,1000,2000,5000] @@ -144,7 +170,7 @@ loki.process "perf_logs" { metric.histogram { name = "bytes_sent_hist" prefix = "taler_requests_" - description = "Number of bytes sent, reported from Nginx" + description = "Number of bytes sent, reported from Nginx (non-GET requests)" source = "bytes_sent" max_idle_duration = "24h" buckets = [1,10,50,100,200,500,1000,2000,5000] @@ -152,7 +178,7 @@ loki.process "perf_logs" { metric.histogram { name = "response_time_hist" prefix = "taler_requests_" - description = "Time taken for Nginx to respond" + description = "Time taken for Nginx to respond (non-GET requests)" source = "response_time" max_idle_duration = "24h" buckets = [0.001,0.0025,0.005,0.010,0.025,0.050,0.1,0.25,0.5,1,2,5] @@ -160,12 +186,115 @@ loki.process "perf_logs" { metric.histogram { name = "upstream_response_time_hist" prefix = "taler_requests_" - description = "Time taken for the Exchange to respond to Nginx" + description = "Time taken for the Exchange to respond to Nginx (non-GET requests)" source = "upstream_response_time" max_idle_duration = "24h" buckets = [0.001,0.0025,0.005,0.010,0.025,0.050,0.1,0.25,0.5,1,2,5] } + } + // Finally, pass on to Loki forward_to = [loki.write.grafana_loki.receiver] -} -\ No newline at end of file +} + + +// Monitor journald logs +// Export journald logs to our generic filter +// but first pass to our generic filter to change labels +loki.source.journal "read" { + forward_to = [loki.process.filter_generic_logs.receiver] + relabel_rules = loki.relabel.journal.rules + max_age = "12h" + labels = {component = "loki.source.journal"} +} + + +// https://community.grafana.com/t/scrape-journald-log-with-alloy-docker-container/119896 +loki.relabel "journal" { + forward_to = [] + rule { + source_labels = ["__journal__systemd_unit"] + target_label = "systemd_unit" + } + rule { + source_labels = ["__journal__hostname"] + target_label = "systemd_hostname" + } + rule { + source_labels = ["__journal__transport"] + 
target_label = "systemd_transport" + } +} + + +// Generic filter for logs +// See: https://grafana.com/docs/alloy/latest/tutorials/send-logs-to-loki/ +loki.process "filter_generic_logs" { + // Determine log level: + // https://community.grafana.com/t/extract-log-level-via-regex-and-set-it-as-a-label/134938/5 + stage.regex { + expression = `(?P<level>(?i)\b(info|debug|error|warn|warning|trace|fatal)\b)` + } + + // https://grafana.com/docs/alloy/latest/tutorials/processing-logs/ + // Drop debug + stage.drop { + source = "level" + value = "debug" + drop_counter_reason = "boring debugging data" + } + // Drop info + stage.drop { + source = "level" + value = "info" + drop_counter_reason = "boring info logs" + } + // Drop trace + stage.drop { + source = "level" + value = "trace" + drop_counter_reason = "boring info logs" + } + + stage.metrics { + // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metriccounter-block + + // https://grafana.com/docs/alloy/latest/reference/components/loki/loki.process/#metrichistogram-block + // https://www.robustperception.io/how-does-a-prometheus-histogram-work + metric.counter { + name = "warn_log_level" + prefix = "system_logs_" + description = "Warnings in system logs" + source = "level" + value = "warn" + action = "inc" + } + metric.counter { + name = "warn_log_level" + prefix = "system_logs_" + description = "Warnings in system logs" + source = "level" + value = "warning" + action = "inc" + } + metric.counter { + name = "error_log_level" + prefix = "system_logs_" + description = "Errors in system logs" + source = "level" + value = "error" + action = "inc" + } + metric.counter { + name = "fatal_log_level" + prefix = "system_logs_" + description = "Fatal reports in system logs" + source = "level" + value = "fatal" + action = "inc" + } + } + + forward_to = [loki.write.grafana_loki.receiver] +}