author     Andreas Madsen <amwebdk@gmail.com>  2018-01-25 15:33:57 +0100
committer  Joyee Cheung <joyeec9h3@gmail.com>  2018-01-29 13:26:34 +0800
commit     368517c0dc2576e1f8f315e53d0cfebb22acf9e2 (patch)
tree       eff7d3dd5509233411408540f45d6d207ace587e /benchmark
parent     94e36f1f3160fc318114496288aa11b7a4c6dd3a (diff)
download   android-node-v8-368517c0dc2576e1f8f315e53d0cfebb22acf9e2.tar.gz
           android-node-v8-368517c0dc2576e1f8f315e53d0cfebb22acf9e2.tar.bz2
           android-node-v8-368517c0dc2576e1f8f315e53d0cfebb22acf9e2.zip
benchmark: make compare.R easier to understand
PR-URL: https://github.com/nodejs/node/pull/18373
Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de>
Diffstat (limited to 'benchmark')
-rw-r--r--  benchmark/compare.R | 61
1 file changed, 49 insertions, 12 deletions
diff --git a/benchmark/compare.R b/benchmark/compare.R
index 5085f4ea73..1527d680c3 100644
--- a/benchmark/compare.R
+++ b/benchmark/compare.R
@@ -35,6 +35,21 @@ if (!is.null(plot.filename)) {
ggsave(plot.filename, p);
}
+# computes the shared standard error, as used in the welch t-test
+welch.sd = function (old.rate, new.rate) {
+ old.se.squared = var(old.rate) / length(old.rate)
+ new.se.squared = var(new.rate) / length(new.rate)
+ return(sqrt(old.se.squared + new.se.squared))
+}
+
+# calculate the improvement confidence interval. The improvement is calculated
+# by dividing by old.mu and not new.mu, because old.mu is what the mean
+# improvement is calculated relative to.
+confidence.interval = function (shared.se, old.mu, w, risk) {
+ interval = qt(1 - (risk / 2), w$parameter) * shared.se;
+ return(sprintf("±%.2f%%", (interval / old.mu) * 100))
+}
+
# Print a table with results
statistics = ddply(dat, "name", function(subdat) {
old.rate = subset(subdat, binary == "old")$rate;
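
The welch.sd and confidence.interval helpers added above compute the standard
error used by Welch's t-test, sqrt(var(old)/n_old + var(new)/n_new), and the
confidence-interval half-width qt(1 - risk/2, df) * se, reported as a
percentage of the old mean. A minimal standalone sketch of the same
calculation, using hypothetical rate vectors rather than real benchmark output:

    # Hypothetical ops/sec samples for the old and new binaries (illustration only).
    old.rate = c(100, 102, 98, 101, 99)
    new.rate = c(110, 108, 112, 109, 111)

    # Welch's t-test (t.test defaults to unequal variances); w$parameter holds
    # the Welch-Satterthwaite degrees of freedom used by confidence.interval.
    w = t.test(new.rate, old.rate)

    # Shared standard error, as in welch.sd above.
    shared.se = sqrt(var(old.rate) / length(old.rate) +
                     var(new.rate) / length(new.rate))

    # 95% confidence-interval half-width, relative to the old mean, as in
    # confidence.interval(shared.se, old.mu, w, 0.05).
    interval = qt(1 - 0.05 / 2, w$parameter) * shared.se
    sprintf("±%.2f%%", (interval / mean(old.rate)) * 100)
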
@@ -45,33 +60,42 @@ statistics = ddply(dat, "name", function(subdat) {
new.mu = mean(new.rate);
improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));
- p.value = NA;
- confidence = 'NA';
+ r = list(
+ confidence = "NA",
+ improvement = improvement,
+ "accuracy (*)" = "NA",
+ "(**)" = "NA",
+ "(***)" = "NA"
+ );
+
# Check if there is enough data to calculate the p-value
if (length(old.rate) > 1 && length(new.rate) > 1) {
# Perform a statistical test to see if there actually is a difference in
# performance.
w = t.test(rate ~ binary, data=subdat);
- p.value = w$p.value;
+ shared.se = welch.sd(old.rate, new.rate)
# Add user friendly stars to the table. There should be at least one star
# before you can say that there is an improvement.
confidence = '';
- if (p.value < 0.001) {
+ if (w$p.value < 0.001) {
confidence = '***';
- } else if (p.value < 0.01) {
+ } else if (w$p.value < 0.01) {
confidence = '**';
- } else if (p.value < 0.05) {
+ } else if (w$p.value < 0.05) {
confidence = '*';
}
+
+ r = list(
+ confidence = confidence,
+ improvement = improvement,
+ "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
+ "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
+ "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
+ );
}
- r = list(
- improvement = improvement,
- confidence = confidence,
- p.value = p.value
- );
- return(data.frame(r));
+ return(data.frame(r, check.names=FALSE));
});
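
The hunk above replaces the raw p.value column with significance stars and a
per-level confidence interval. The result row is built with
data.frame(r, check.names=FALSE) because column names such as "accuracy (*)"
and "(**)" are not syntactically valid R names, and data.frame() would
otherwise pass them through make.names() and mangle them. A minimal sketch
with hypothetical values:

    # Hypothetical row; the quoted column names survive only because
    # check.names=FALSE is passed.
    r = list(
      confidence = "*",
      improvement = "9.95 %",
      "accuracy (*)" = "±1.23%",
      "(**)" = "±1.70%",
      "(***)" = "±2.30%"
    )
    data.frame(r, check.names=FALSE)  # headers stay "accuracy (*)", "(**)", "(***)"
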
@@ -81,3 +105,16 @@ statistics$name = NULL;
options(width = 200);
print(statistics);
+cat("\n")
+cat(sprintf(
+"Be aware that when doing many comparisions the risk of a false-positive
+result increases. In this case there are %d comparisions, you can thus
+expect the following amount of false-positive results:
+ %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
+ %.2f false positives, when considering a 1%% risk acceptance (**, ***),
+ %.2f false positives, when considering a 0.1%% risk acceptance (***)
+",
+nrow(statistics),
+nrow(statistics) * 0.05,
+nrow(statistics) * 0.01,
+nrow(statistics) * 0.001))
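
The closing note multiplies the number of comparisons by each risk level to
estimate how many false positives to expect. For example, with a hypothetical
run of 60 benchmark comparisons:

    # 60 is a hypothetical comparison count, not taken from a real run.
    n.comparisons = 60
    n.comparisons * c(0.05, 0.01, 0.001)  # 3.00 0.60 0.06 expected false positives
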