# inst/extra/algorithms.speed.test.R In pROC: Display and Analyze ROC Curves

```r
# This script tests which algorithm is faster: 2 or 3?
# The intuition is that 3 is much better with few thresholds, but behaves horribly
# as the number increases. However it seems less affected by the number of data points.
# It might change over time so let's make it easier to test again in the future

library(pROC)
library(dplyr)
#devtools::install_github("xrobin/xavamess")
library(xavamess)
library(ggplot2)
library(parallel)

# Number of observations to test: 1e2, 1e3, ..., 1e7
ns <- as.vector(outer(c(1), c(2:7), function(i, j) i * 10^j))
# Controls how many thresholds we get (data is round(rnorm(n) * norm.factor)):
# 1, 2, 5, 10, 20, 50, 100, 200, 500
norm.factors <- as.vector(outer(c(1, 2, 5), c(0:2), function(i, j) i * 10^j))
# Number of cores to execute on.
# We want the number of physical cores, remove 1 to be sure.
# Assign the result (the original computed the value but discarded it);
# max(..., na.rm = TRUE) guards against detectCores() returning NA.
n.cores <- max(1, parallel::detectCores() / 2 - 1, na.rm = TRUE)

# Loop over all conditions: algorithms 2 and 3, crossed with norm.factors and ns.
# Produces one row per (algorithm, norm.factor, n) with the median timing.
times.by.alg <- lapply(2:3, function(algorithm) {
  times <- lapply(rev(norm.factors), function(norm.factor) {
  #times <- autoParLapply(rev(norm.factors), function(norm.factor) {
    as.data.frame(t(sapply(ns, function(n) {
      print(sprintf("a=%s, norm=%s, n=%s", algorithm, norm.factor, n))
      # Get some data
      lab <- rbinom(n, 1, 0.5)
      d <- round(rnorm(n) * norm.factor)
      # How many thresholds do we have?
      nthr <- length(unique(d))
      if (nthr > 1000 && algorithm == 3) {
        # Algorithm 3 is out here anyway, no need to waste time to test it.
        # Use named NAs so sapply() keeps these columns aligned with the
        # named timing columns returned below.
        return(c(n = n, norm.factor = norm.factor, nthr = nthr,
                 algorithm = algorithm,
                 user.self = NA, sys.self = NA, elapsed = NA))
      }
      # Repeat 5 times and take the per-component median time
      time <- apply(replicate(5, system.time(
        roc(lab, d, algorithm = algorithm, levels = c(0, 1), direction = "<")
      )), 1, median)
      return(c(n = n, norm.factor = norm.factor, nthr = nthr,
               algorithm = algorithm, time[1:3]))
    })))
  }) # Physical cores, not logical !!!
  #}, .maxCores = 3) # Physical cores, not logical !!!
  # Return the combined data frame directly (no need for an intermediate)
  bind_rows(times)
})
times.by.alg.df <- bind_rows(times.by.alg)
times.by.alg.df$algorithm <- as.factor(times.by.alg.df$algorithm)

# Plot the data: timing vs. n / nthr, faceted by algorithm, on log scales
library(ggplot2)
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color = algorithm)) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color = nthr)) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(n, user.self, color = log(nthr))) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(nthr, user.self, color = n)) + facet_grid(algorithm ~ .) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(nthr, n, color = user.self)) + facet_grid(algorithm ~ .)

# Algorithm 3 is linear with nthr * n?
ggplot(times.by.alg.df) + geom_point(aes(nthr * n, user.self)) + facet_grid(algorithm ~ .)
# Keep the same orientation as the ggplot above: time as response,
# product as predictor. I() is needed so `*` is arithmetic, not a
# formula interaction (the original had the formula inverted).
plot(user.self ~ I(nthr * n), na.omit(times.by.alg.df %>% filter(algorithm == 3)))

# Test algorithm 2: fit a linear model of time vs. n
# (drop very small n where timing noise dominates)
times.by.alg.df2 <- times.by.alg.df %>% filter(algorithm == 2, n > 200)
lm.2 <- lm(user.self ~ n * nthr, times.by.alg.df2)
# nthr gives low, barely significant but negative estimates which don't make sense, so remove it...
lm.2 <- lm(user.self ~ n + 0, times.by.alg.df2)
summary(lm.2)
plot(lm.2)

# Predicted vs. observed timings
times.by.alg.df2$predicted.user.self <- predict(lm.2, times.by.alg.df2)
plot(times.by.alg.df2$user.self, times.by.alg.df2$predicted.user.self)

# Observed timings with the fitted line overlaid
plot(times.by.alg.df2$n, times.by.alg.df2$user.self)
grid <- expand.grid(n = ns, nthr = ns)
grid$prod <- grid$n * grid$nthr
grid <- grid[order(grid$n), ]
lines(grid$n, predict(lm.2, grid))

# Test algorithm 3: time should scale with the product n * nthr
times.by.alg.df3 <- times.by.alg.df %>%
  filter(algorithm == 3, n > 200) %>%
  mutate(prod = nthr * n)
lm.3 <- lm(user.self ~ n:nthr + 0, times.by.alg.df3)
summary(lm.3)
plot(lm.3)

# Predicted vs. observed timings
times.by.alg.df3$predicted.user.self <- predict(lm.3, times.by.alg.df3)
plot(times.by.alg.df3$user.self, times.by.alg.df3$predicted.user.self)

# Observed timings vs. n * nthr with the fitted line overlaid
plot(times.by.alg.df3$n * times.by.alg.df3$nthr, times.by.alg.df3$user.self)
grid <- expand.grid(n = ns, nthr = ns)
grid$prod <- grid$n * grid$nthr
grid <- grid[order(grid$prod), ]
lines(grid$prod, predict(lm.3, grid))

# Predict time on initial data with both fitted models and
# mark which algorithm the models expect to be faster
times.by.alg.df$user.self.predicted.2 <- predict(lm.2, times.by.alg.df)
times.by.alg.df$user.self.predicted.3 <- predict(lm.3, times.by.alg.df)
times.by.alg.df$predicted.best <- ifelse(times.by.alg.df$user.self.predicted.3 < times.by.alg.df$user.self.predicted.2, 3, 2)
ggplot(times.by.alg.df) + geom_point(aes(user.self.predicted.2, user.self.predicted.3, color = n)) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(user.self.predicted.2, user.self.predicted.3, color = predicted.best)) + scale_x_log10() + scale_y_log10()
ggplot(times.by.alg.df) + geom_point(aes(n, nthr, color = predicted.best)) + scale_x_log10() + scale_y_log10()

### Final formula:
# Algorithm 2: user.self = 2.959e-07 * n
# Algorithm 3: user.self = 5.378e-09 * n * nthr
# Reduction (break-even point, solve for nthr):
# 2.959e-07 * n = 5.378e-09 * n * nthr
# 2.959e-07 / 5.378e-09 = nthr
# 55 = nthr
```

## Try the pROC package in your browser

Any scripts or data that you put into this service are public.

pROC documentation built on Nov. 2, 2023, 6:05 p.m.