diff options
author | Andreas Madsen <amwebdk@gmail.com> | 2018-01-25 15:33:57 +0100 |
---|---|---|
committer | Joyee Cheung <joyeec9h3@gmail.com> | 2018-01-29 13:26:34 +0800 |
commit | 368517c0dc2576e1f8f315e53d0cfebb22acf9e2 (patch) | |
tree | eff7d3dd5509233411408540f45d6d207ace587e /benchmark | |
parent | 94e36f1f3160fc318114496288aa11b7a4c6dd3a (diff) | |
download | android-node-v8-368517c0dc2576e1f8f315e53d0cfebb22acf9e2.tar.gz android-node-v8-368517c0dc2576e1f8f315e53d0cfebb22acf9e2.tar.bz2 android-node-v8-368517c0dc2576e1f8f315e53d0cfebb22acf9e2.zip |
benchmark: make compare.R easier to understand
PR-URL: https://github.com/nodejs/node/pull/18373
Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de>
Diffstat (limited to 'benchmark')
-rw-r--r-- | benchmark/compare.R | 61 |
1 files changed, 49 insertions, 12 deletions
diff --git a/benchmark/compare.R b/benchmark/compare.R index 5085f4ea73..1527d680c3 100644 --- a/benchmark/compare.R +++ b/benchmark/compare.R @@ -35,6 +35,21 @@ if (!is.null(plot.filename)) { ggsave(plot.filename, p); } +# computes the shared standard error, as used in the welch t-test +welch.sd = function (old.rate, new.rate) { + old.se.squared = var(old.rate) / length(old.rate) + new.se.squared = var(new.rate) / length(new.rate) + return(sqrt(old.se.squared + new.se.squared)) +} + +# calculate the improvement confidence interval. The improvement is calculated +# by dividing by old.mu and not new.mu, because old.mu is what the mean +# improvement is calculated relative to. +confidence.interval = function (shared.se, old.mu, w, risk) { + interval = qt(1 - (risk / 2), w$parameter) * shared.se; + return(sprintf("±%.2f%%", (interval / old.mu) * 100)) +} + # Print a table with results statistics = ddply(dat, "name", function(subdat) { old.rate = subset(subdat, binary == "old")$rate; @@ -45,33 +60,42 @@ statistics = ddply(dat, "name", function(subdat) { new.mu = mean(new.rate); improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100)); - p.value = NA; - confidence = 'NA'; + r = list( + confidence = "NA", + improvement = improvement, + "accuracy (*)" = "NA", + "(**)" = "NA", + "(***)" = "NA" + ); + # Check if there is enough data to calculate the p-value if (length(old.rate) > 1 && length(new.rate) > 1) { # Perform a statistics test to see if there actually is a difference in # performance. w = t.test(rate ~ binary, data=subdat); - p.value = w$p.value; + shared.se = welch.sd(old.rate, new.rate) # Add user friendly stars to the table. There should be at least one star # before you can say that there is an improvement. 
confidence = ''; - if (p.value < 0.001) { + if (w$p.value < 0.001) { confidence = '***'; - } else if (p.value < 0.01) { + } else if (w$p.value < 0.01) { confidence = '**'; - } else if (p.value < 0.05) { + } else if (w$p.value < 0.05) { confidence = '*'; } + + r = list( + confidence = confidence, + improvement = improvement, + "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05), + "(**)" = confidence.interval(shared.se, old.mu, w, 0.01), + "(***)" = confidence.interval(shared.se, old.mu, w, 0.001) + ); } - r = list( - improvement = improvement, - confidence = confidence, - p.value = p.value - ); - return(data.frame(r)); + return(data.frame(r, check.names=FALSE)); }); @@ -81,3 +105,16 @@ statistics$name = NULL; options(width = 200); print(statistics); +cat("\n") +cat(sprintf( +"Be aware that when doing many comparisons the risk of a false-positive +result increases. In this case there are %d comparisons, you can thus +expect the following amount of false-positive results: + %.2f false positives, when considering a 5%% risk acceptance (*, **, ***), + %.2f false positives, when considering a 1%% risk acceptance (**, ***), + %.2f false positives, when considering a 0.1%% risk acceptance (***) +", +nrow(statistics), +nrow(statistics) * 0.05, +nrow(statistics) * 0.01, +nrow(statistics) * 0.001)) |