From 4774aaf20ad72acbe85b0042d98d5619dcd033db Mon Sep 17 00:00:00 2001
From: Boss Marco <bossm8@bfh.ch>
Date: Sun, 15 May 2022 10:35:41 +0200
Subject: update auditor pg conf

---
 additional/grafana/load-statistics.json |  24 ++----
 additional/grafana/proxy.json           |  16 ++--
 additional/grafana/transactions.json    |   4 +-
 experiment/scripts/auditor.sh           | 131 +++++++++++++++++++++++++++++++-
 4 files changed, 144 insertions(+), 31 deletions(-)

diff --git a/additional/grafana/load-statistics.json b/additional/grafana/load-statistics.json
index 6fef09f..9d5ee29 100644
--- a/additional/grafana/load-statistics.json
+++ b/additional/grafana/load-statistics.json
@@ -630,8 +630,8 @@
         "overrides": [
           {
             "matcher": {
-              "id": "byName",
-              "options": "Database Size"
+              "id": "byRegexp",
+              "options": "Database(.*)"
             },
             "properties": [
               {
@@ -643,18 +643,6 @@
                 "value": "bytes"
               }
             ]
-          },
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "Total Requests"
-            },
-            "properties": [
-              {
-                "id": "unit",
-                "value": "short"
-              }
-            ]
           }
         ]
       },
@@ -696,10 +684,10 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "exemplar": true,
-          "expr": "sum(pg_database_size_bytes{instance=\"127.0.0.1:9187\"})",
+          "expr": "sum by (datname) (pg_database_size_bytes{instance=\"127.0.0.1:9187\", datname=~\"taler-exchange|taler-ingress\"})",
           "hide": false,
           "interval": "",
-          "legendFormat": "Database Size",
+          "legendFormat": "Database Size {{ datname }}",
           "refId": "C"
         }
       ],
@@ -1599,7 +1587,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "exemplar": true,
-          "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))",
+          "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))",
           "hide": false,
           "interval": "",
           "legendFormat": "Average Response Time",
@@ -1628,6 +1616,6 @@
   "timezone": "",
   "title": "Load Statistics",
   "uid": "rkyhDAt7z",
-  "version": 72,
+  "version": 75,
   "weekStart": ""
 }
\ No newline at end of file
diff --git a/additional/grafana/proxy.json b/additional/grafana/proxy.json
index ee2949a..58f94bb 100644
--- a/additional/grafana/proxy.json
+++ b/additional/grafana/proxy.json
@@ -15,7 +15,7 @@
       "type": "grafana",
       "id": "grafana",
       "name": "Grafana",
-      "version": "8.4.3"
+      "version": "8.4.6"
     },
     {
       "type": "panel",
@@ -61,7 +61,7 @@
   "gnetId": 12708,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1649441826256,
+  "iteration": 1652602026981,
   "links": [],
   "liveNow": false,
   "panels": [
@@ -122,7 +122,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "8.4.3",
+      "pluginVersion": "8.4.6",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -220,7 +220,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "8.4.3",
+      "pluginVersion": "8.4.6",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -326,7 +326,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "8.4.3",
+      "pluginVersion": "8.4.6",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -451,7 +451,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "exemplar": true,
-          "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))",
+          "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))",
           "interval": "",
           "legendFormat": "Response Time",
           "refId": "A"
@@ -538,7 +538,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "exemplar": true,
-          "expr": "avg ((rate(taler_requests_upstream_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_upstream_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))",
+          "expr": "avg ((rate(taler_requests_upstream_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_upstream_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))",
           "interval": "",
           "legendFormat": "Upstream Response Time",
           "refId": "A"
@@ -635,6 +635,6 @@
   "timezone": "",
   "title": "Proxy",
   "uid": "MsjffzSZz",
-  "version": 16,
+  "version": 18,
   "weekStart": ""
 }
\ No newline at end of file
diff --git a/additional/grafana/transactions.json b/additional/grafana/transactions.json
index 7c17a8a..f1d6a1a 100644
--- a/additional/grafana/transactions.json
+++ b/additional/grafana/transactions.json
@@ -1721,7 +1721,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "exemplar": false,
-          "expr": "(sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"coins-deposit\", status=\"200\"} [5m]) or vector(0))) / on (__name__) (sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"reserves-withdraw\", status=\"200\"} [5m]) or vector(0)) + on (__name__) sum by (__name__) (rate(taler_exchange_batch_withdraw_num_coins{} [2m]) or vector(0)))",
+          "expr": "(sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"coins-deposit\", status=\"200\"} [5m]) or vector(0))) / on (__name__) (sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"reserves-withdraw\", status=\"200\"} [5m]) or vector(0)) + on (__name__) sum by (__name__) (rate(taler_exchange_batch_withdraw_num_coins{} [5m]) or vector(0)))",
           "instant": true,
           "interval": "",
           "legendFormat": "",
@@ -2528,6 +2528,6 @@
   "timezone": "",
   "title": "Transactions",
   "uid": "83vvgKKnk",
-  "version": 162,
+  "version": 163,
   "weekStart": ""
 }
\ No newline at end of file
diff --git a/experiment/scripts/auditor.sh b/experiment/scripts/auditor.sh
index f8a9972..45da0cc 100644
--- a/experiment/scripts/auditor.sh
+++ b/experiment/scripts/auditor.sh
@@ -20,12 +20,137 @@ function create_users() {
 }
 
 function init_db() {
+  # Get hardware info to tune in postgresql.conf
+  SHARED_MEM=$(($(awk '/MemTotal/ {print $2}' /proc/meminfo) / 4 ))
+  CACHE_SIZE=$(($(awk '/MemTotal/ {print $2}' /proc/meminfo) * 3/4))
+  NUM_CPU=$(lscpu | grep "CPU(s)" | head -n 1 | awk '{print $2}')
+
+  # Enable huge pages
+  # Size for huge_pages =~ shared_buffers * 1.25 so that there is enough
+  VM_PEAK=$((${SHARED_MEM} * 10/8))
+
+  HUGE_PAGES_SIZE=$(grep ^Hugepagesize /proc/meminfo | awk '{print $2}')
+  NUM_PAGES=$((${VM_PEAK} / ${HUGE_PAGES_SIZE}))
+
+  if ! grep -q "vm.nr_hugepages'" /etc/sysctl.conf; then
+    echo "vm.nr_hugepages=${NUM_PAGES}" >> /etc/sysctl.conf
+    sysctl -p
+  fi
+
+  # disable swap
+  swapoff -a
+
   echo "
   listen_addresses='*'
+
   wal_level = logical
-  min_wal_size = 20GB
-  max_wal_size = 200GB
-  synchronous_commit=off
+
+  log_destination=syslog
+  syslog_ident='taler-auditor-db'
+
+  log_error_verbosity=terse
+  # log_min_messages=PANIC
+  # log_min_error_statement=PANIC
+  # client_min_messages=ERROR
+
+  # For pgbadger
+  # log_checkpoints=on
+  # log_connections=on
+  # log_disconnections=on
+  # log_lock_waits=on
+  # log_temp_files=0
+  # log_autovacuum_min_duration=0
+  # log_error_verbosity=default
+  # log_duration=on
+  # log_statement=all
+
+  # For explain.py
+  # syslog_split_messages=off
+  # log_statement=all
+  # log_error_verbosity=default
+
+  # Large tables perform bad with the default settings
+  # However, they could also be set on each table indiviudally
+  # (NOTE: on partitions!)
+  # ALTER TABLE known_coins_default
+  #  SET (autovacuum_vacuum_scale_factor = 0.0, autovacuum_vacuum_threshold = 1000);
+  default_statistics_target=300
+  autovacuum_vacuum_cost_limit=400
+  autovacuum_vacuum_scale_factor=0.1
+  autovacuum_vacuum_threshold=1000
+  # Default 50, 0.1
+  autovacuum_analyze_threshold=50
+  autovacuum_analyze_scale_factor=0.1
+
+  # use 25% of the available memory 
+  # (https://www.postgresql.org/docs/13/runtime-config-resource.html)
+  shared_buffers=${SHARED_MEM}kB
+  effective_cache_size=${CACHE_SIZE}kB
+
+  huge_pages=on
+  
+  # (https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-MAX-WAL-SIZE)
+  min_wal_size=20GB
+  max_wal_size=200GB
+  wal_buffers=1GB
+
+  checkpoint_completion_target=0.9
+  checkpoint_timeout = 15min
+  checkpoint_flush_after = 2MB
+  random_page_cost=1.1
+  
+  # Default 2kB
+  bgwriter_flush_after = 2MB
+
+  # Default 0
+  # https://www.cybertec-postgresql.com/en/the-mysterious-backend_flush_after-configuration-setting/
+  backend_flush_after = 2MB
+
+  # Too much results in CPU load
+  # https://www.postgresql.org/docs/13/runtime-config-resource.html#GUC-EFFECTIVE-IO-CONCURRENCY
+  effective_io_concurrency = 200
+
+  # Bad when turned off - Recovering db may not be possible
+  # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-FSYNC
+  fsync = on
+
+  # Not so bad as when turning off fsync, but single transactions might get lost on crash - but
+  # like they would have aborted cleanly
+  # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-SYNCHRONOUS-COMMIT
+  # When having replication, this one can be changed (in local only on and off are of use)
+  # https://www.postgresql.org/docs/13/runtime-config-replication.html#GUC-SYNCHRONOUS-STANDBY-NAMES
+  # on causes us to get 100% IO load
+  synchronous_commit = off
+
+  # Default off
+  wal_compression = off
+
+  wal_sync_method = fsync
+
+  # Bad to turn off, may lead to inconcistency
+  # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-FULL-PAGE-WRITES
+  # Write full pages to WAL while partial are written to disk, helpful in case of crash, then
+  # partially written pages can be recovered.
+  # Can be disabled if FS does not support partial written pages (such as ZFS)
+  full_page_writes = on
+
+  max_worker_processes=${NUM_CPU}
+  max_parallel_workers=${NUM_CPU}
+  max_parallel_workers_per_gather=10
+  max_connections=500
+
+  max_parallel_maintenance_workers=12
+  
+  # out of shared memory
+  max_locks_per_transaction=85
+  
+  # (max used =~ work_mem * max_connections)
+  # NOTE: This formula is not completely correct 
+  work_mem=2GB
+  maintenance_work_mem=4GB
+
+  max_logical_replication_workers=${NUM_CPU}
+  max_sync_workers_per_subscription=${NUM_CPU}
   " > /etc/postgresql/${POSTGRES_VERSION}/main/auditor.conf
 
   echo "
-- 
cgit v1.2.3