From 4774aaf20ad72acbe85b0042d98d5619dcd033db Mon Sep 17 00:00:00 2001 From: Boss Marco Date: Sun, 15 May 2022 10:35:41 +0200 Subject: update auditor pg conf --- additional/grafana/load-statistics.json | 24 ++---- additional/grafana/proxy.json | 16 ++-- additional/grafana/transactions.json | 4 +- experiment/scripts/auditor.sh | 131 +++++++++++++++++++++++++++++++- 4 files changed, 144 insertions(+), 31 deletions(-) diff --git a/additional/grafana/load-statistics.json b/additional/grafana/load-statistics.json index 6fef09f..9d5ee29 100644 --- a/additional/grafana/load-statistics.json +++ b/additional/grafana/load-statistics.json @@ -630,8 +630,8 @@ "overrides": [ { "matcher": { - "id": "byName", - "options": "Database Size" + "id": "byRegexp", + "options": "Database(.*)" }, "properties": [ { @@ -643,18 +643,6 @@ "value": "bytes" } ] - }, - { - "matcher": { - "id": "byName", - "options": "Total Requests" - }, - "properties": [ - { - "id": "unit", - "value": "short" - } - ] } ] }, @@ -696,10 +684,10 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(pg_database_size_bytes{instance=\"127.0.0.1:9187\"})", + "expr": "sum by (datname) (pg_database_size_bytes{instance=\"127.0.0.1:9187\", datname=~\"taler-exchange|taler-ingress\"})", "hide": false, "interval": "", - "legendFormat": "Database Size", + "legendFormat": "Database Size {{ datname }}", "refId": "C" } ], @@ -1599,7 +1587,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))", + "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))", "hide": false, "interval": "", "legendFormat": "Average Response Time", @@ -1628,6 +1616,6 @@ "timezone": "", "title": "Load Statistics", "uid": "rkyhDAt7z", - "version": 72, + "version": 75, "weekStart": "" } \ No newline at end of file diff --git a/additional/grafana/proxy.json b/additional/grafana/proxy.json index ee2949a..58f94bb 100644 --- a/additional/grafana/proxy.json +++ b/additional/grafana/proxy.json @@ -15,7 +15,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "8.4.3" + "version": "8.4.6" }, { "type": "panel", @@ -61,7 +61,7 @@ "gnetId": 12708, "graphTooltip": 0, "id": null, - "iteration": 1649441826256, + "iteration": 1652602026981, "links": [], "liveNow": false, "panels": [ @@ -122,7 +122,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.4.3", + "pluginVersion": "8.4.6", "pointradius": 2, "points": false, "renderer": "flot", @@ -220,7 +220,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.4.3", + "pluginVersion": "8.4.6", "pointradius": 2, "points": false, "renderer": "flot", @@ -326,7 +326,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.4.3", + "pluginVersion": "8.4.6", "pointradius": 2, "points": false, "renderer": "flot", @@ -451,7 +451,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))", + "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))", "interval": "", "legendFormat": "Response Time", "refId": "A" @@ -538,7 +538,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "avg ((rate(taler_requests_upstream_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_upstream_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))", + "expr": "avg ((rate(taler_requests_upstream_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_upstream_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))", "interval": "", "legendFormat": "Upstream Response Time", "refId": "A" @@ -635,6 +635,6 @@ "timezone": "", "title": "Proxy", "uid": "MsjffzSZz", - "version": 16, + "version": 18, "weekStart": "" } \ No newline at end of file diff --git a/additional/grafana/transactions.json b/additional/grafana/transactions.json index 7c17a8a..f1d6a1a 100644 --- a/additional/grafana/transactions.json +++ b/additional/grafana/transactions.json @@ -1721,7 +1721,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": false, - "expr": "(sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"coins-deposit\", status=\"200\"} [5m]) or vector(0))) / on (__name__) (sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"reserves-withdraw\", status=\"200\"} [5m]) or vector(0)) + on (__name__) sum by (__name__) (rate(taler_exchange_batch_withdraw_num_coins{} [2m]) or vector(0)))", + "expr": "(sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"coins-deposit\", status=\"200\"} [5m]) or vector(0))) / on (__name__) (sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"reserves-withdraw\", status=\"200\"} [5m]) or vector(0)) + on (__name__) sum by (__name__) (rate(taler_exchange_batch_withdraw_num_coins{} [5m]) or vector(0)))", "instant": true, "interval": "", "legendFormat": "", @@ -2528,6 +2528,6 @@ "timezone": "", "title": "Transactions", "uid": "83vvgKKnk", - "version": 162, + "version": 163, "weekStart": "" } \ No newline at end of file diff --git a/experiment/scripts/auditor.sh b/experiment/scripts/auditor.sh index f8a9972..45da0cc 100644 --- a/experiment/scripts/auditor.sh +++ b/experiment/scripts/auditor.sh @@ -20,12 +20,137 @@ function create_users() { } function init_db() { + # Get hardware info to tune in postgresql.conf + SHARED_MEM=$(($(awk '/MemTotal/ {print $2}' /proc/meminfo) / 4 )) + CACHE_SIZE=$(($(awk '/MemTotal/ {print $2}' /proc/meminfo) * 3/4)) + NUM_CPU=$(lscpu | grep "CPU(s)" | head -n 1 | awk '{print $2}') + + # Enable huge pages + # Size for huge_pages =~ shared_buffers * 1.25 so that there is enough + VM_PEAK=$((${SHARED_MEM} * 10/8)) + + HUGE_PAGES_SIZE=$(grep ^Hugepagesize /proc/meminfo | awk '{print $2}') + NUM_PAGES=$((${VM_PEAK} / ${HUGE_PAGES_SIZE})) + + if ! grep -q "vm.nr_hugepages'" /etc/sysctl.conf; then + echo "vm.nr_hugepages=${NUM_PAGES}" >> /etc/sysctl.conf + sysctl -p + fi + + # disable swap + swapoff -a + echo " listen_addresses='*' + wal_level = logical - min_wal_size = 20GB - max_wal_size = 200GB - synchronous_commit=off + + log_destination=syslog + syslog_ident='taler-auditor-db' + + log_error_verbosity=terse + # log_min_messages=PANIC + # log_min_error_statement=PANIC + # client_min_messages=ERROR + + # For pgbadger + # log_checkpoints=on + # log_connections=on + # log_disconnections=on + # log_lock_waits=on + # log_temp_files=0 + # log_autovacuum_min_duration=0 + # log_error_verbosity=default + # log_duration=on + # log_statement=all + + # For explain.py + # syslog_split_messages=off + # log_statement=all + # log_error_verbosity=default + + # Large tables perform bad with the default settings + # However, they could also be set on each table indiviudally + # (NOTE: on partitions!) + # ALTER TABLE known_coins_default + # SET (autovacuum_vacuum_scale_factor = 0.0, autovacuum_vacuum_threshold = 1000); + default_statistics_target=300 + autovacuum_vacuum_cost_limit=400 + autovacuum_vacuum_scale_factor=0.1 + autovacuum_vacuum_threshold=1000 + # Default 50, 0.1 + autovacuum_analyze_threshold=50 + autovacuum_analyze_scale_factor=0.1 + + # use 25% of the available memory + # (https://www.postgresql.org/docs/13/runtime-config-resource.html) + shared_buffers=${SHARED_MEM}kB + effective_cache_size=${CACHE_SIZE}kB + + huge_pages=on + + # (https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-MAX-WAL-SIZE) + min_wal_size=20GB + max_wal_size=200GB + wal_buffers=1GB + + checkpoint_completion_target=0.9 + checkpoint_timeout = 15min + checkpoint_flush_after = 2MB + random_page_cost=1.1 + + # Default 2kB + bgwriter_flush_after = 2MB + + # Default 0 + # https://www.cybertec-postgresql.com/en/the-mysterious-backend_flush_after-configuration-setting/ + backend_flush_after = 2MB + + # Too much results in CPU load + # https://www.postgresql.org/docs/13/runtime-config-resource.html#GUC-EFFECTIVE-IO-CONCURRENCY + effective_io_concurrency = 200 + + # Bad when turned off - Recovering db may not be possible + # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-FSYNC + fsync = on + + # Not so bad as when turning off fsync, but single transactions might get lost on crash - but + # like they would have aborted cleanly + # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-SYNCHRONOUS-COMMIT + # When having replication, this one can be changed (in local only on and off are of use) + # https://www.postgresql.org/docs/13/runtime-config-replication.html#GUC-SYNCHRONOUS-STANDBY-NAMES + # on causes us to get 100% IO load + synchronous_commit = off + + # Default off + wal_compression = off + + wal_sync_method = fsync + + # Bad to turn off, may lead to inconcistency + # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-FULL-PAGE-WRITES + # Write full pages to WAL while partial are written to disk, helpful in case of crash, then + # partially written pages can be recovered. + # Can be disabled if FS does not support partial written pages (such as ZFS) + full_page_writes = on + + max_worker_processes=${NUM_CPU} + max_parallel_workers=${NUM_CPU} + max_parallel_workers_per_gather=10 + max_connections=500 + + max_parallel_maintenance_workers=12 + + # out of shared memory + max_locks_per_transaction=85 + + # (max used =~ work_mem * max_connections) + # NOTE: This formula is not completely correct + work_mem=2GB + maintenance_work_mem=4GB + + max_logical_replication_workers=${NUM_CPU} + max_sync_workers_per_subscription=${NUM_CPU} " > /etc/postgresql/${POSTGRES_VERSION}/main/auditor.conf echo " -- cgit v1.2.3