author     Boss Marco <bossm8@bfh.ch>    2022-05-15 10:35:41 +0200
committer  Boss Marco <bossm8@bfh.ch>    2022-05-15 10:35:41 +0200
commit     4774aaf20ad72acbe85b0042d98d5619dcd033db (patch)
tree       016742a33ecdb5c8588c81691283414c790fa8a8
parent     51125241e19a7823cb56aa640098d1466a9fffb9 (diff)
download   grid5k-4774aaf20ad72acbe85b0042d98d5619dcd033db.tar.gz
           grid5k-4774aaf20ad72acbe85b0042d98d5619dcd033db.tar.bz2
           grid5k-4774aaf20ad72acbe85b0042d98d5619dcd033db.zip
update auditor pg conf
-rw-r--r--  additional/grafana/load-statistics.json   24
-rw-r--r--  additional/grafana/proxy.json             16
-rw-r--r--  additional/grafana/transactions.json       4
-rw-r--r--  experiment/scripts/auditor.sh            131
4 files changed, 144 insertions, 31 deletions
diff --git a/additional/grafana/load-statistics.json b/additional/grafana/load-statistics.json
index 6fef09f..9d5ee29 100644
--- a/additional/grafana/load-statistics.json
+++ b/additional/grafana/load-statistics.json
@@ -630,8 +630,8 @@
"overrides": [
{
"matcher": {
- "id": "byName",
- "options": "Database Size"
+ "id": "byRegexp",
+ "options": "Database(.*)"
},
"properties": [
{
@@ -643,18 +643,6 @@
"value": "bytes"
}
]
- },
- {
- "matcher": {
- "id": "byName",
- "options": "Total Requests"
- },
- "properties": [
- {
- "id": "unit",
- "value": "short"
- }
- ]
}
]
},
@@ -696,10 +684,10 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
- "expr": "sum(pg_database_size_bytes{instance=\"127.0.0.1:9187\"})",
+ "expr": "sum by (datname) (pg_database_size_bytes{instance=\"127.0.0.1:9187\", datname=~\"taler-exchange|taler-ingress\"})",
"hide": false,
"interval": "",
- "legendFormat": "Database Size",
+ "legendFormat": "Database Size {{ datname }}",
"refId": "C"
}
],
@@ -1599,7 +1587,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
- "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))",
+ "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))",
"hide": false,
"interval": "",
"legendFormat": "Average Response Time",
@@ -1628,6 +1616,6 @@
"timezone": "",
"title": "Load Statistics",
"uid": "rkyhDAt7z",
- "version": 72,
+ "version": 75,
"weekStart": ""
}
\ No newline at end of file
diff --git a/additional/grafana/proxy.json b/additional/grafana/proxy.json
index ee2949a..58f94bb 100644
--- a/additional/grafana/proxy.json
+++ b/additional/grafana/proxy.json
@@ -15,7 +15,7 @@
"type": "grafana",
"id": "grafana",
"name": "Grafana",
- "version": "8.4.3"
+ "version": "8.4.6"
},
{
"type": "panel",
@@ -61,7 +61,7 @@
"gnetId": 12708,
"graphTooltip": 0,
"id": null,
- "iteration": 1649441826256,
+ "iteration": 1652602026981,
"links": [],
"liveNow": false,
"panels": [
@@ -122,7 +122,7 @@
"alertThreshold": true
},
"percentage": false,
- "pluginVersion": "8.4.3",
+ "pluginVersion": "8.4.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -220,7 +220,7 @@
"alertThreshold": true
},
"percentage": false,
- "pluginVersion": "8.4.3",
+ "pluginVersion": "8.4.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -326,7 +326,7 @@
"alertThreshold": true
},
"percentage": false,
- "pluginVersion": "8.4.3",
+ "pluginVersion": "8.4.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -451,7 +451,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
- "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))",
+ "expr": "avg ((rate(taler_requests_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))",
"interval": "",
"legendFormat": "Response Time",
"refId": "A"
@@ -538,7 +538,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
- "expr": "avg ((rate(taler_requests_upstream_response_time_hist_sum{endpoint!~\"keys|terms\", status=\"200\"} [2m]) ) / (rate(taler_requests_upstream_response_time_hist_count{endpoint!~\"keys|terms\", status=\"200\"} [2m])))",
+ "expr": "avg ((rate(taler_requests_upstream_response_time_hist_sum{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m]) ) / (rate(taler_requests_upstream_response_time_hist_count{endpoint!~\"keys|terms|wire\", status=\"200\"} [2m])))",
"interval": "",
"legendFormat": "Upstream Response Time",
"refId": "A"
@@ -635,6 +635,6 @@
"timezone": "",
"title": "Proxy",
"uid": "MsjffzSZz",
- "version": 16,
+ "version": 18,
"weekStart": ""
}
\ No newline at end of file
diff --git a/additional/grafana/transactions.json b/additional/grafana/transactions.json
index 7c17a8a..f1d6a1a 100644
--- a/additional/grafana/transactions.json
+++ b/additional/grafana/transactions.json
@@ -1721,7 +1721,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": false,
- "expr": "(sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"coins-deposit\", status=\"200\"} [5m]) or vector(0))) / on (__name__) (sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"reserves-withdraw\", status=\"200\"} [5m]) or vector(0)) + on (__name__) sum by (__name__) (rate(taler_exchange_batch_withdraw_num_coins{} [2m]) or vector(0)))",
+ "expr": "(sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"coins-deposit\", status=\"200\"} [5m]) or vector(0))) / on (__name__) (sum by (__name__) (rate(taler_requests_total_requests{endpoint=\"reserves-withdraw\", status=\"200\"} [5m]) or vector(0)) + on (__name__) sum by (__name__) (rate(taler_exchange_batch_withdraw_num_coins{} [5m]) or vector(0)))",
"instant": true,
"interval": "",
"legendFormat": "",
@@ -2528,6 +2528,6 @@
"timezone": "",
"title": "Transactions",
"uid": "83vvgKKnk",
- "version": 162,
+ "version": 163,
"weekStart": ""
}
\ No newline at end of file
diff --git a/experiment/scripts/auditor.sh b/experiment/scripts/auditor.sh
index f8a9972..45da0cc 100644
--- a/experiment/scripts/auditor.sh
+++ b/experiment/scripts/auditor.sh
@@ -20,12 +20,137 @@ function create_users() {
}
function init_db() {
+ # Get hardware info used to tune postgresql.conf
+ SHARED_MEM=$(($(awk '/MemTotal/ {print $2}' /proc/meminfo) / 4 ))
+ CACHE_SIZE=$(($(awk '/MemTotal/ {print $2}' /proc/meminfo) * 3/4))
+ NUM_CPU=$(lscpu | grep "CPU(s)" | head -n 1 | awk '{print $2}')
+
+ # Enable huge pages
+ # Size for huge_pages =~ shared_buffers * 1.25 so that there is enough headroom
+ VM_PEAK=$((${SHARED_MEM} * 10/8))
+
+ HUGE_PAGES_SIZE=$(grep ^Hugepagesize /proc/meminfo | awk '{print $2}')
+ NUM_PAGES=$((${VM_PEAK} / ${HUGE_PAGES_SIZE}))
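+ # Worked example of the sizing above (hypothetical host with 16 GiB RAM and
+ # a 2048 kB huge page size):
+ #   MemTotal   = 16777216 kB
+ #   SHARED_MEM = 16777216 / 4     = 4194304 kB  (shared_buffers, 4 GiB)
+ #   CACHE_SIZE = 16777216 * 3 / 4 = 12582912 kB (effective_cache_size, 12 GiB)
+ #   VM_PEAK    = 4194304 * 10 / 8 = 5242880 kB
+ #   NUM_PAGES  = 5242880 / 2048   = 2560 huge pages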
+
+ if ! grep -q "vm.nr_hugepages" /etc/sysctl.conf; then
+ echo "vm.nr_hugepages=${NUM_PAGES}" >> /etc/sysctl.conf
+ sysctl -p
+ fi
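+ # The kernel fulfils vm.nr_hugepages on a best-effort basis and may allocate
+ # fewer pages when memory is fragmented; verify the actual allocation with:
+ #   grep HugePages_Total /proc/meminfo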
+
+ # disable swap
+ swapoff -a
+
echo "
listen_addresses='*'
+
wal_level = logical
- min_wal_size = 20GB
- max_wal_size = 200GB
- synchronous_commit=off
+
+ log_destination=syslog
+ syslog_ident='taler-auditor-db'
+
+ log_error_verbosity=terse
+ # log_min_messages=PANIC
+ # log_min_error_statement=PANIC
+ # client_min_messages=ERROR
+
+ # For pgbadger
+ # log_checkpoints=on
+ # log_connections=on
+ # log_disconnections=on
+ # log_lock_waits=on
+ # log_temp_files=0
+ # log_autovacuum_min_duration=0
+ # log_error_verbosity=default
+ # log_duration=on
+ # log_statement=all
+
+ # For explain.py
+ # syslog_split_messages=off
+ # log_statement=all
+ # log_error_verbosity=default
+
+ # Large tables perform badly with the default settings
+ # However, they could also be set on each table individually
+ # (NOTE: on partitions!)
+ # ALTER TABLE known_coins_default
+ # SET (autovacuum_vacuum_scale_factor = 0.0, autovacuum_vacuum_threshold = 1000);
+ default_statistics_target=300
+ autovacuum_vacuum_cost_limit=400
+ autovacuum_vacuum_scale_factor=0.1
+ autovacuum_vacuum_threshold=1000
+ # Default 50, 0.1
+ autovacuum_analyze_threshold=50
+ autovacuum_analyze_scale_factor=0.1
+
+ # use 25% of the available memory
+ # (https://www.postgresql.org/docs/13/runtime-config-resource.html)
+ shared_buffers=${SHARED_MEM}kB
+ effective_cache_size=${CACHE_SIZE}kB
+
+ huge_pages=on
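+ # With huge_pages=on the server fails to start if the kernel cannot provide
+ # enough huge pages; huge_pages=try would fall back to regular pages instead.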
+
+ # (https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-MAX-WAL-SIZE)
+ min_wal_size=20GB
+ max_wal_size=200GB
+ wal_buffers=1GB
+
+ checkpoint_completion_target=0.9
+ checkpoint_timeout = 15min
+ checkpoint_flush_after = 2MB
+ random_page_cost=1.1
+
+ # Default 512kB (Linux)
+ bgwriter_flush_after = 2MB
+
+ # Default 0
+ # https://www.cybertec-postgresql.com/en/the-mysterious-backend_flush_after-configuration-setting/
+ backend_flush_after = 2MB
+
+ # Setting this too high results in high CPU load
+ # https://www.postgresql.org/docs/13/runtime-config-resource.html#GUC-EFFECTIVE-IO-CONCURRENCY
+ effective_io_concurrency = 200
+
+ # Bad when turned off - Recovering db may not be possible
+ # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-FSYNC
+ fsync = on
+
+ # Not as bad as turning off fsync, but single transactions might get lost on a crash -
+ # as if they had aborted cleanly
+ # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-SYNCHRONOUS-COMMIT
+ # With replication this can be set to other levels (without it only on and off are useful)
+ # https://www.postgresql.org/docs/13/runtime-config-replication.html#GUC-SYNCHRONOUS-STANDBY-NAMES
+ # 'on' causes us to hit 100% IO load
+ synchronous_commit = off
+
+ # Default off
+ wal_compression = off
+
+ wal_sync_method = fsync
+
+ # Bad to turn off, may lead to inconsistency
+ # https://www.postgresql.org/docs/13/runtime-config-wal.html#GUC-FULL-PAGE-WRITES
+ # Writes the full content of each page to WAL so that partially written (torn)
+ # pages on disk can be recovered after a crash.
+ # Can be disabled if the FS does not produce partially written pages (such as ZFS)
+ full_page_writes = on
+
+ max_worker_processes=${NUM_CPU}
+ max_parallel_workers=${NUM_CPU}
+ max_parallel_workers_per_gather=10
+ max_connections=500
+
+ max_parallel_maintenance_workers=12
+
+ # Avoids 'out of shared memory' errors
+ max_locks_per_transaction=85
+
+ # (max used =~ work_mem * max_connections)
+ # NOTE: This formula is not completely correct
+ work_mem=2GB
+ maintenance_work_mem=4GB
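+ # Rough upper bound implied by the formula above (assuming every connection
+ # runs a single sort/hash at once, which is rarely exact):
+ #   2GB work_mem * 500 max_connections =~ 1TB
+ # In practice each plan node (sort, hash) and each parallel worker may use up
+ # to work_mem, so real usage depends on the workload.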
+
+ max_logical_replication_workers=${NUM_CPU}
+ max_sync_workers_per_subscription=${NUM_CPU}
" > /etc/postgresql/${POSTGRES_VERSION}/main/auditor.conf
echo "