# Benchmark: compare pROC's ROC-computation algorithms 2 and 3
# This script tests which algorithm is faster: 2 or 3?
# The intuition is that 3 is much better with few thresholds, but behaves horribly
# as the number increases. However it seems less affected by the number of data points.
# It might change over time so let's make it easier to test again in the future
library(pROC)
library(dplyr)
#devtools::install_github("xrobin/xavamess")
library(xavamess)
library(ggplot2)
library(parallel)
# Number of observations to test: 10^2 through 10^7
ns <- as.vector(outer(c(1), c(2:7), function(i, j) i * 10^j))
# Controls how many thresholds we get: d <- round(rnorm(n) * norm.factor)
# below, so larger norm.factor yields more distinct predictor values
norm.factors <- as.vector(outer(c(1, 2, 5), c(0:2), function(i, j) i * 10^j))
# Number of cores to execute on.
# detectCores() reports logical cores; halve for physical cores, remove 1 to be sure.
# Fix: the original computed this value but never assigned it, so it was
# discarded; keep it for use with parallel execution (see .maxCores below).
n.cores <- parallel::detectCores() / 2 - 1
# Loop over all those conditions
# Benchmark every (algorithm, norm.factor, n) combination.
# Returns a list (one element per algorithm) of data frames with the
# timing of a single roc() call (median of 5 repeats).
times.by.alg <- lapply(2:3, function(algorithm) {
  # rev() so the slow many-threshold settings are tackled first
  times <- lapply(rev(norm.factors), function(norm.factor) {
  #times <- autoParLapply(rev(norm.factors), function(norm.factor) {
    as.data.frame(t(sapply(ns, function(n) {
      # Progress report (message() rather than print(sprintf()))
      message(sprintf("a=%s, norm=%s, n=%s", algorithm, norm.factor, n))
      # Get some data: random binary labels and a rounded gaussian predictor
      lab <- rbinom(n, 1, 0.5)
      d <- round(rnorm(n) * norm.factor)
      # How many thresholds do we have?
      nthr <- length(unique(d))
      if (nthr > 1000 && algorithm == 3) {
        # Algorithm 3 is out here anyway, no need to waste time to test it.
        # Fix: name the NA timings (the original used unnamed rep(NA, 3)),
        # so column names don't depend on which row sapply() sees first.
        return(c(n = n, norm.factor = norm.factor, nthr = nthr,
                 algorithm = algorithm,
                 user.self = NA, sys.self = NA, elapsed = NA))
      }
      # Repeat 5 times and take the median of each timing component
      time <- apply(replicate(5, system.time(
        roc(lab, d, algorithm = algorithm, levels = c(0, 1), direction = "<")
      )), 1, median)
      # time[1:3] carries names user.self / sys.self / elapsed
      c(n = n, norm.factor = norm.factor, nthr = nthr,
        algorithm = algorithm, time[1:3])
    })))
  }) # Physical cores, not logical !!!
  #}, .maxCores = 3) # Physical cores, not logical !!!
  # Fix: drop the dead `times.df =` local assignment; the combined data
  # frame is simply the return value of this lapply body.
  bind_rows(times)
})
# Stack the per-algorithm data frames and turn the algorithm id into a
# factor so ggplot treats it as a discrete variable.
times.by.alg.df <- times.by.alg %>%
  bind_rows() %>%
  mutate(algorithm = as.factor(algorithm))
# Plot the data (exploratory; all on log-log scales unless noted)
library(ggplot2)
# Runtime vs n, one color per algorithm
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color=algorithm)) + scale_x_log10() + scale_y_log10()
# Runtime vs n, colored by threshold count, one panel per algorithm
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color=nthr)) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
# Same, but log-colored so the low-threshold range is distinguishable
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color=log(nthr))) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
# Runtime vs threshold count, colored by n
ggplot(times.by.alg.df) + geom_point(aes(nthr, user.self, color=n)) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
# nthr x n plane, runtime as color (linear scales)
ggplot(times.by.alg.df) + geom_point(aes(nthr, n, color=user.self)) + facet_grid(algorithm ~ .)
# Algorithm 3 is linear with nthr * n?
ggplot(times.by.alg.df) + geom_point(aes(nthr * n, user.self)) + facet_grid(algorithm ~ .)
# NOTE(review): formula puts nthr*n on the y-axis and user.self on the
# x-axis (y ~ x) — the linearity check works either way, but confirm the
# orientation is intentional
plot(nthr * n ~ user.self, na.omit(times.by.alg.df %>% filter(algorithm==3)))
# Test algorithm 2: model runtime as a function of n (and nthr)
# Drop the smallest n where timing noise dominates
times.by.alg.df2 <- times.by.alg.df %>% filter(algorithm == 2, n > 200)
# First try the full interaction model to see whether nthr matters
lm.2 <- lm(user.self ~ n * nthr, times.by.alg.df2)
# nthr gives low, barely significant but negative estimates which don't make sense, so remove it...
# ...and refit through the origin: runtime ~ coefficient * n
lm.2 <- lm(user.self ~ n + 0, times.by.alg.df2)
summary(lm.2)
plot(lm.2)
# Visual check: predicted vs observed runtime
times.by.alg.df2$predicted.user.self <- predict(lm.2, times.by.alg.df2)
plot(times.by.alg.df2$user.self, times.by.alg.df2$predicted.user.self)
plot(times.by.alg.df2$n, times.by.alg.df2$user.self)
# Overlay the fitted line on the last plot (grid sorted by n so lines()
# draws left to right)
grid <- expand.grid(n=ns, nthr=ns)
grid$prod = grid$n * grid$nthr
grid <- grid[order(grid$n),]
lines(grid$n, predict(lm.2, grid))
# Test algorithm 3: model runtime as proportional to n * nthr
times.by.alg.df3 <- times.by.alg.df %>%
  filter(algorithm == 3, n > 200) %>%
  mutate(prod = nthr * n)
# Fit through the origin on the n:nthr interaction only
lm.3 <- lm(user.self ~ n:nthr + 0, times.by.alg.df3)
summary(lm.3)
plot(lm.3)
# Visual check: predicted vs observed runtime
times.by.alg.df3$predicted.user.self <- predict(lm.3, times.by.alg.df3)
plot(times.by.alg.df3$user.self, times.by.alg.df3$predicted.user.self)
plot(times.by.alg.df3$n * times.by.alg.df3$nthr, times.by.alg.df3$user.self)
# Rebuild the prediction grid (overwrites the algorithm-2 grid above),
# this time sorted by the product so lines() draws left to right
grid <- expand.grid(n=ns, nthr=ns)
grid$prod = grid$n * grid$nthr
grid <- grid[order(grid$prod),]
lines(grid$prod, predict(lm.3, grid))
# Predict time on initial data with both fitted models and pick, for each
# (n, nthr) combination, which algorithm is expected to be faster
times.by.alg.df$user.self.predicted.2 <- predict(lm.2, times.by.alg.df)
times.by.alg.df$user.self.predicted.3 <- predict(lm.3, times.by.alg.df)
times.by.alg.df$predicted.best <- ifelse(times.by.alg.df$user.self.predicted.3 < times.by.alg.df$user.self.predicted.2, 3, 2)
# Predicted runtimes against each other, colored by n then by winner
ggplot(times.by.alg.df) + geom_point(aes(user.self.predicted.2, user.self.predicted.3, color=n))+ scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(user.self.predicted.2, user.self.predicted.3, color=predicted.best))+ scale_x_log10() + scale_y_log10()
# Decision boundary in the (n, nthr) plane
ggplot(times.by.alg.df) + geom_point(aes(n, nthr, color=predicted.best))+ scale_x_log10() + scale_y_log10()
### Final formula:
# Algorithm 2: user.self = 2.959e-07 * n
# Algorithm 3: user.self = 5.378e-09 * n * nthr
# Reduction: since lm.2 ignores nthr, the crossover depends on nthr only:
# 2.959e-07 * n = 5.378e-09 * n * nthr
# 2.959e-07 / 5.378e-09 = nthr
# 55 = nthr
# i.e. algorithm 3 is predicted faster when nthr < ~55, algorithm 2 otherwise
# (Hosting-site boilerplate, commented out so the script parses:)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.