In neurodata/fselect: Linear Optimal Low-Rank Projection

Figure 3 - LOL Paper

require(lolR)
require(ggplot2)
require(latex2exp)
require(MASS)
require(gridExtra)
require(data.table)
require(reshape2)
require(R.matlab)
require(grid)
require(plyr)
require(gtable)
require(tidyverse)
require(ggplotify)
classifier.alg = "lda"
# compute the cutoff for the particular trial to get an approximate elbow
# by computing the smallest r with an associated lhat within 5%
# of the global minimum lhat

w=.8
h=.2

mcols <- c("#808080", "#EE7600", "#0e3ec1","#469990") #"#8B4513")
names(mcols) <- c("outlier", "1", "2", "3")

g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}

The below code will produce the required data, which runs LOL, cPCA, PCA, and LR-CCA at the desired simulation settings. Note that this function will multi-thread natively, and took approximately 7 hours to run on a 96 core machine with $\frac{7}{8}$ of the cores active.

source('./sims_driver.R')

First, we prepare the plots of subsets of the mean and covariance matrices:

# run the simulations once to obtain some basic visualizations
n=100
# the simulations to call themselves
sims <- list(lol.sims.rtrunk, lol.sims.rev_rtrunk, lol.sims.cross)
ds <- c(100, 100, 100)
# additional arguments for each simulation scenario
opt_args <- list(list(K=3), list(robust=n*.4, maxvar.outlier=1000), list())
sim_names = c("Trunk-3", "Robust", "Cross")
sim_titles = c("(A)", "(B)", "(C)")

counter <- 1
dims <- list(c(1, 2), c(2, 50), c(50, 51))
sim.dims <- list()
sim.dat <- do.call(rbind, lapply(1:length(sims), function(i) {
  simn <- do.call(sims[[i]], c(list(n, ds[i]), opt_args[[i]]))
  if(is.null(simn$robust)) {
    outlier <- FALSE
  } else {
    outlier <- !simn$robust$inlier
  }
  if (sim_names[i] == "Cross") {
    X1 <- simn$X[,ds[i]/2]; X2 <- simn$X[,ds[i]/2 + 1]
  } else {
    X1 <- simn$X[,1]; X2 <- simn$X[,2]
  }
  return(data.frame(X1=X1, X2=X2, Y=simn$Y, sim=sim_names[i], outlier=outlier))
})) %>%
  mutate(sim=recode_factor(sim, "Trunk-3"="(A) Trunk-3", "Robust"="(B) Robust", "Cross"="(C) Cross"),
         sim=factor(sim, levels=c("(A) Trunk-3", "(B) Robust", "(C) Cross"), ordered=TRUE))

sim_plt <- sim.dat %>%
  mutate(Class=ifelse(outlier, "outlier", Y)) %>%
  group_by(sim) %>%
  mutate(X1=(X1-min(X1))/(max(X1) - min(X1)), X2=(X2-min(X2))/(max(X2)-min(X2))) %>%
  ungroup() %>%
  ggplot(aes(x=X1, y=X2, color=Class)) +
  geom_point(size=2) +
  facet_grid(.~sim) +
  scale_color_manual(values=c("1"="blue", "2"="darkorange", "3"="cyan", "outlier"="darkgray")) +
  ylab("Simulated Data") +
  xlab("") +
  theme_bw() +
  theme(text=element_text(size=20), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(),
        axis.text.x=element_blank(), axis.text.y=element_blank())

Next, we aggregate over the respective iterations, and subset plots for each function:

# read the results in
results <- readRDS(paste('../data/sims/lol_sims_', classifier.alg, '.rds', sep="")) %>%
  group_by(sim, alg, r) %>%
  summarize(lhat=mean(lhat)) %>%
  mutate(best.lhat=min(lhat), best.r=r[lhat == best.lhat], lhat.thresh=best.lhat*1.05,
         lhat.beats=lhat <= best.lhat, r.star=min(r[lhat.beats])) %>%
  mutate(alg=factor(recode_factor(alg, "QOL"="QOQ", "LRLDA"="rrLDA"), 
                    levels=c("PCA", "rrLDA", "CCA", "RP", "PLS", "LOL", "QOQ", "RLOL"),
                    ordered=TRUE)) %>%
  filter(!((alg %in% c("RLOL")) & (sim == "Cross"))) %>%
  filter(!((alg %in% c("QOQ")) & (sim %in% c("Trunk-3", "Robust")))) %>%
  mutate(sim=recode_factor(sim, "Trunk-3"="(A) Trunk-3", "Robust"="(B) Robust", "Cross"="(C) Cross"),
         sim=factor(sim, levels=c("(A) Trunk-3", "(B) Robust", "(C) Cross"), ordered=TRUE))

results.star <- results %>% filter(r == r.star)

algs <-  c("LOL", "RLOL", "QOQ", "PLS", "CCA", "rrLDA", "PCA", "RP", "ROAD", "LASSO")
acols <- c("#f41711", "#f41711", "#f41711", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#a65628", "#99b4c6", "#666666")
linestyle <- c(1,2,3,1,1,1,1,1,1, 1)
names(acols) <- algs
names(linestyle) <- algs

grid_sim <- results %>%
  ggplot(aes(x=r, y=lhat, color=alg)) +
  geom_line(aes(linetype=alg), size=1.2) +
  geom_point(data=results.star, size=4, shape=18) +
  scale_linetype_manual(values=linestyle, name="Algorithm") +
  scale_color_manual(values=acols, name="Algorithm") +
  facet_grid(.~sim) +
  xlab("Number of Embedding Dimensions") +
  ylab("Error") +
  guides(color=guide_legend(ncol=2), linetype=guide_legend(ncol=2)) +
  theme_bw() +
  theme(text=element_text(size=20), strip.text.x=element_blank(),
        strip.background=element_blank()) 

lhat_plt <- as.ggplot(gtable_filter(ggplotGrob(grid_sim), "axis-b-[2,3]", trim=FALSE, invert=TRUE))

We combine and plot:

grid.arrange(sim_plt, lhat_plt, nrow=2, heights=c(0.35, 0.35))

Multiclass Plot

plt.multiclass <- readRDS('../data/sims/lol_results_khump.rds') %>%
  group_by(K, r, alg) %>%
  summarize(lhat=mean(lhat)) %>%
  mutate(lhat.chance=1 - 1/K,
         K=paste0("K=", K),
         kappa=(lhat.chance - lhat)/(lhat.chance),
         K=factor(K, levels=c("K=2", "K=4", "K=6", "K=8", "K=10"), ordered=TRUE)) %>%
  mutate(alg=recode_factor(alg, "LRLDA"="rrLDA")) %>%
  ggplot(aes(x=r, y=kappa, color=alg)) +
    geom_line(size=1.2) +
    facet_grid(.~K) +
    xlab("Number of Embedding Dimensions") +
    ylab("Cohen's Kappa") +
    ggtitle("Performance of Embedding Strategies on Multiclass Problems") +
    theme(font=element_text(size=20)) +
    theme_bw() +
  scale_linetype_manual(values=linestyle, name="Algorithm") +
  scale_color_manual(values=acols, name="Algorithm") +
  theme(text=element_text(size=20))


plt.multiclass <- as.ggplot(gtable_filter(ggplotGrob(plt.multiclass), "axis-b-[2,3,4,5]", trim=FALSE, invert=TRUE))
plt.multiclass