# This script tests which algorithm is faster: 2 or 3?
# The intuition is that 3 is much better with few thresholds, but behaves horribly
# as the number increases. However it seems less affected by the number of data points.
# It might change over time so let's make it easier to test again in the future
library(pROC)
library(dplyr)
#devtools::install_github("xrobin/xavamess")
library(xavamess)
library(ggplot2)
library(parallel)
# Number of observations to test
ns <- as.vector(outer(c(1), c(2:7), function(i, j) i * 10^j))
# Controls how many thresholds we get
norm.factors <- as.vector(outer(c(1, 2, 5), c(0:2), function(i, j) i * 10^j))
# Number of cores to execute on
# We want the number of physical cores, remove 1 to be sure
parallel::detectCores() / 2 - 1
# Loop over all those conditions
times.by.alg <- lapply(2:3, function(algorithm) {
times <- lapply(rev(norm.factors), function(norm.factor) {
#times <- autoParLapply(rev(norm.factors), function(norm.factor) {
as.data.frame(t(sapply(ns, function(n) {
print(sprintf("a=%s, norm=%s, n=%s", algorithm, norm.factor, n))
# Get some data
lab <- rbinom(n, 1, 0.5)
d <- round(rnorm(n) * norm.factor)
# How many thresholds do we have?
nthr <- length(unique(d))
if (nthr > 1000 && algorithm == 3) {
# Algorithm 3 is out here anyway, no need to waste time to test it
return(c(n=n, norm.factor=norm.factor, nthr=nthr, algorithm=algorithm, rep(NA, 3)))
}
# Repeat 5 times and take the median time
time <- apply(replicate(5, system.time(roc(lab, d, algorithm=algorithm, levels = c(0, 1), direction = "<"))), 1, median)
return(c(n=n, norm.factor=norm.factor, nthr=nthr, algorithm=algorithm, time[1:3]))
})))
}) # Physical cores, not logical !!!
#}, .maxCores = 3) # Physical cores, not logical !!!
times.df = bind_rows(times)
})
times.by.alg.df <- bind_rows(times.by.alg)
times.by.alg.df$algorithm <- as.factor(times.by.alg.df$algorithm)
# Plot the data
library(ggplot2)
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color=algorithm)) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color=nthr)) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color=log(nthr))) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(nthr, user.self, color=n)) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(nthr, n, color=user.self)) + facet_grid(algorithm ~ .)
# Algorithm 3 is linear with nthr * n?
ggplot(times.by.alg.df) + geom_point(aes(nthr * n, user.self)) + facet_grid(algorithm ~ .)
plot(nthr * n ~ user.self, na.omit(times.by.alg.df %>% filter(algorithm==3)))
# Test algorithm 2
times.by.alg.df2 <- times.by.alg.df %>% filter(algorithm == 2, n > 200)
lm.2 <- lm(user.self ~ n * nthr, times.by.alg.df2)
# nthr gives low, barely significant but negative estimates which don't make sense, so remove it...
lm.2 <- lm(user.self ~ n + 0, times.by.alg.df2)
summary(lm.2)
plot(lm.2)
times.by.alg.df2$predicted.user.self <- predict(lm.2, times.by.alg.df2)
plot(times.by.alg.df2$user.self, times.by.alg.df2$predicted.user.self)
plot(times.by.alg.df2$n, times.by.alg.df2$user.self)
grid <- expand.grid(n=ns, nthr=ns)
grid$prod = grid$n * grid$nthr
grid <- grid[order(grid$n),]
lines(grid$n, predict(lm.2, grid))
# Test algorithm 3
times.by.alg.df3 <- times.by.alg.df %>%
filter(algorithm == 3, n > 200) %>%
mutate(prod = nthr * n)
lm.3 <- lm(user.self ~ n:nthr + 0, times.by.alg.df3)
summary(lm.3)
plot(lm.3)
times.by.alg.df3$predicted.user.self <- predict(lm.3, times.by.alg.df3)
plot(times.by.alg.df3$user.self, times.by.alg.df3$predicted.user.self)
plot(times.by.alg.df3$n * times.by.alg.df3$nthr, times.by.alg.df3$user.self)
grid <- expand.grid(n=ns, nthr=ns)
grid$prod = grid$n * grid$nthr
grid <- grid[order(grid$prod),]
lines(grid$prod, predict(lm.3, grid))
# Predict time on initial data
times.by.alg.df$user.self.predicted.2 <- predict(lm.2, times.by.alg.df)
times.by.alg.df$user.self.predicted.3 <- predict(lm.3, times.by.alg.df)
times.by.alg.df$predicted.best <- ifelse(times.by.alg.df$user.self.predicted.3 < times.by.alg.df$user.self.predicted.2, 3, 2)
ggplot(times.by.alg.df) + geom_point(aes(user.self.predicted.2, user.self.predicted.3, color=n))+ scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(user.self.predicted.2, user.self.predicted.3, color=predicted.best))+ scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(n, nthr, color=predicted.best))+ scale_x_log10() + scale_y_log10()
### Final formula:
# Algorithm 2: user.self = 2.959e-07 * n
# Algorithm 3: user.self = 5.378e-09 * n * nthr
# Reduction:
# 2.959e-07 * n = 5.378e-09 * n * nthr
# 2.959e-07 / 5.378e-09 = nthr
# 55 = nthr
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.