#!/usr/bin/env Rscript
library(ggplot2);
library(plyr);

# get __dirname and load ./_cli.R
args = commandArgs(trailingOnly = F);
dirname = dirname(sub("--file=", "", args[grep("--file", args)]));
source(paste0(dirname, '/_cli.R'), chdir=T);

if (!is.null(args.options$help) ||
   (!is.null(args.options$plot) && args.options$plot == TRUE)) {
  stop("usage: cat file.csv | Rscript compare.R
 --help            show this message
 --plot filename   save plot to filename");
}

plot.filename = args.options$plot;

dat = read.csv(
  file('stdin'),
  colClasses=c('character', 'character', 'character', 'numeric', 'numeric')
);
dat = data.frame(dat);

dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration);
dat$name = paste0(dat$filename, dat$configuration);

# Create a box plot
if (!is.null(plot.filename)) {
  p = ggplot(data=dat);
  p = p + geom_boxplot(aes(x=nameTwoLines, y=rate, fill=binary));
  p = p + ylab("rate of operations (higher is better)");
  p = p + xlab("benchmark");
  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5));
  ggsave(plot.filename, p);
}

# Computes the shared standard error, as used in the Welch t-test
welch.sd = function (old.rate, new.rate) {
  old.se.squared = var(old.rate) / length(old.rate)
  new.se.squared = var(new.rate) / length(new.rate)
  return(sqrt(old.se.squared + new.se.squared))
}

# Calculate the improvement confidence interval. The improvement is calculated
# by dividing by old.mu and not new.mu, because old.mu is what the mean
# improvement is calculated relative to.
confidence.interval = function (shared.se, old.mu, w, risk) {
  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
}
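# Illustrative sketch (kept as comments so it is never executed) of how the two
# helpers above combine. The sample vectors are made-up numbers, not real
# benchmark output:
#
#   old.rate = c(100, 102, 98, 101, 99)
#   new.rate = c(105, 107, 103, 106, 104)
#   w = t.test(new.rate, old.rate)            # Welch two-sample t-test
#   shared.se = welch.sd(old.rate, new.rate)  # sqrt(se.old^2 + se.new^2) = 1
#   confidence.interval(shared.se, mean(old.rate), w, 0.05)
#   # roughly "±2.31%": the 95% interval half-width relative to the old mean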
# Print a table with results
statistics = ddply(dat, "name", function(subdat) {
  old.rate = subset(subdat, binary == "old")$rate;
  new.rate = subset(subdat, binary == "new")$rate;

  # Calculate improvement for the "new" binary compared with the "old" binary
  old.mu = mean(old.rate);
  new.mu = mean(new.rate);
  improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));

  r = list(
    confidence = "NA",
    improvement = improvement,
    "accuracy (*)" = "NA",
    "(**)" = "NA",
    "(***)" = "NA"
  );

  # Check if there is enough data to calculate the p-value
  if (length(old.rate) > 1 && length(new.rate) > 1) {
    # Perform a statistical test to see if there actually is a difference in
    # performance.
    w = t.test(rate ~ binary, data=subdat);
    shared.se = welch.sd(old.rate, new.rate)

    # Add user-friendly stars to the table. There should be at least one star
    # before you can say that there is an improvement.
    confidence = '';
    if (w$p.value < 0.001) {
      confidence = '***';
    } else if (w$p.value < 0.01) {
      confidence = '**';
    } else if (w$p.value < 0.05) {
      confidence = '*';
    }

    r = list(
      confidence = confidence,
      improvement = improvement,
      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
    );
  }

  return(data.frame(r, check.names=FALSE));
});

# Set the benchmark names as the row.names to left-align them in the print
row.names(statistics) = statistics$name;
statistics$name = NULL;

options(width = 200);
print(statistics);
cat("\n")

cat(sprintf(
"Be aware that when doing many comparisons the risk of a false-positive result
increases. In this case there are %d comparisons, so you can expect the
following number of false-positive results:
  %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
  %.2f false positives, when considering a 1%% risk acceptance (**, ***),
  %.2f false positives, when considering a 0.1%% risk acceptance (***)
", nrow(statistics), nrow(statistics) * 0.05, nrow(statistics) * 0.01,
   nrow(statistics) * 0.001))
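# Example invocation (a sketch; "bench.csv" and the plot filename below are
# placeholders, not files this script ships with). The CSV piped in on stdin is
# expected to contain the character columns filename, configuration and binary
# plus the numeric rate column used above, matching the colClasses declared in
# the read.csv call:
#
#   cat bench.csv | Rscript compare.R
#   cat bench.csv | Rscript compare.R --plot comparison.png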