Figure 3 - LOL Paper

require(lolR)
require(ggplot2)
require(latex2exp)
require(MASS)
require(gridExtra)
require(data.table)
require(reshape2)
require(R.matlab)
require(grid)
require(plyr)

# compute the cutoff for the particular trial to get an approximate elbow
# by computing the smallest r with an associated lhat within 5%
# of the global minimum lhat
compute_cutoff <- function(rs, lhats, t=0.05) {
  sr.ix <- sort(rs, decreasing=FALSE, index.return=TRUE)$ix
  # compute minimum value
  min.lhat <- min(lhats)
  # compute minimum value + 5%
  lhat.thresh <- (1 + t)*min.lhat
  # find which indices are all below this
  lhat.below <- which(lhats <= lhat.thresh)
  rs.below <- rs[lhat.below]; lhats.below <- lhats[lhat.below]
  tmin.ix <- min(rs.below, index.return=TRUE)
  return(list(r=rs.below[tmin.ix], lhat=lhats.below[tmin.ix]))
}

w=.8
h=.2
sim_cov_plot <- function(Sigmas, mus, priors, title="", yl="Dimension", xl="Dimension", ndim=10,
                         nbreaks=4, legend.name=TeX("")) {
  Sigma <- lol:::lol.mvr(Sigmas, mus, priors)
  Sigma <- Sigma[1:ndim, 1:ndim]  # subset
  Sigma <- (Sigma - min(Sigma))/(max(Sigma) - min(Sigma))
  labs <- c(1, 10)
  sdat <- melt(Sigma)
  plot_cov <- ggplot(sdat, aes(x=Var1, y=Var2, fill=value)) +
    geom_tile() +
    ggtitle(title) +
    xlab(xl) +
    ylab(yl) +
    theme_bw() +
    scale_x_continuous(breaks=labs) +
    scale_y_reverse(breaks=labs) +
    theme(legend.position="bottom") +#, axis.title=element_text(size=14)) +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    scale_fill_gradientn(name=legend.name, colours=c("#fcfbfd", "#9e9ac8", "#3f007d"),
                         limits=c(0, 1), breaks=c(0.0, 0.5, 1.0))#,
                         #guide=guide_colorbar(title.position="top", title.hjust = .5, barheight=.75))
}

mcols <- c("#bdbdbd", "#737373", "#252525")
names(mcols) <- c(1,2,3)
sim_mean_plot <- function(mus, title="", ylab="Magnitude", xlab="Dimension", ndim=10, nbreaks=4) {
  dat <- data.frame(mus[1:ndim,])
  dat <- cbind(data.frame(1:ndim), dat)
  K <- dim(mus)[2]
  ylabs <- sapply(1:K, function(k) as.character(k))
  colnames(dat) <- c("Dimension", ylabs)
  dat <- melt(dat, id="Dimension")
  xlabs <- c(1, 10)
  colnames(dat) <- c("Dimension", "Class", "Magnitude")
  dat$Magnitude = dat$Magnitude/max(abs(dat$Magnitude))
  lims <- c(-1, 1)
  breaks= c(-1, 0, 1)
  dat$Class <- factor(dat$Class, levels=c(1, 2, 3))
  plot_mean <- ggplot(dat, aes(x=Dimension, y=Magnitude, color=Class)) +
    geom_line(size=1.2) +
    theme_bw() +
    ggtitle(title) +
    xlab(xlab) +
    ylab(ylab) +
    scale_y_continuous(limits=lims, breaks=breaks) +
    scale_x_continuous(breaks=xlabs) +
    theme(legend.position="bottom") +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    scale_color_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5))
}


plot_sim_lhats <- function(data, cols, pt.dat, linetype, title="", by=10, from=10, ylab=TeX("$\\hat{L}$"),
                           xlab="Embedded Dimensions", fsize=12) {
  lims <- c(floor(10*min(data$lhat))/10, ceiling(10*max(data$lhat))/10)
  if (unique(data$sim)[1] == "Toeplitz") {
    length.out=4
  } else {
    length.out=3
  }
  breaks = unique(round(seq(from=lims[1], to=lims[2], length.out = length.out), digits=1))
  xlims <- c(min(data$r), max(data$r))
  xbreaks <- seq(from=from, to=xlims[2], by=by)
  plot_sims <- ggplot(data, aes(x=r, y=lhat, linetype=alg, color=alg)) +
    geom_line(size=.95) +
    scale_color_manual(values=cols, limits=names(cols),
                       guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
    scale_linetype_manual(values=linetype, limits=names(cols),
                       guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
    geom_point(data=pt.dat, aes(x=r, y=lhat, linetype=alg, color=alg), size=2) +
    #geom_line(data=base::subset(data, alg == "CCA"), aes(x=r, y=lhat, group=alg, linetype color=alg), size=.75) +
    #geom_point(data=base::subset(pt.dat, alg == "CCA"), aes(x=r, y=lhat, group=alg, color=alg), size=2) +
    #geom_line(data=base::subset(data, alg != "CCA" & alg != "QOQ"), aes(x=r, y=lhat, group=alg, color=alg), size=.75) +
    #geom_point(data=base::subset(pt.dat, alg != "CCA"), aes(x=r, y=lhat, group=alg, color=alg), size=2) +
    #geom_line(data=base::subset(data, alg == "QOQ"), aes(x=r, y=lhat, group=alg, color=alg), linetype="dashed", size=.75) +
    xlab(xlab) +
    ylab(ylab) +
    ggtitle(title) +
    theme_bw() +
    scale_y_continuous(limits=lims, breaks=breaks) +
    scale_x_continuous(limits=xlims, breaks=xbreaks) +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    theme(legend.position="bottom", axis.title.y=element_text(size=fsize))
  return(plot_sims)
}

g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}

The below code will produce the required data, which runs LOL, cPCA, PCA, and LR-CCA at the desired simulation settings. Note that this function will multi-thread natively, and took approximately 7 hours to run on a 96 core machine with $\frac{7}{8}$ of the cores active.

source('./figure_3_driver.R')

Borrowing results from an earlier matlab implementation:

toep <- readMat('./data/fig3/toeplitz.mat')
tr2 <- readMat('./data/fig3/rtrunk.mat')
tr3 <- readMat('./data/fig3/3trunk.mat')
ft <- readMat('./data/fig3/fat_tails.mat')
qd <- readMat('./data/fig3/r2toeplitz.mat')

maxr <- c(90, 30, 30, 30, 30)
minr <- 0
mats <- list(toep, tr2, tr3, ft, qd)
sim_name <- c("Toeplitz", "Trunk-2", "Trunk-3", "Fat-Tails (D=1000)", "QDA")

interest <- list(c("ROAD"), c("ROAD"), c("LASSO"), c("ROAD"), c("ROAD"))
key <- c("ROAD", "lasso")
names(key) <- c("ROAD", "LASSO")


resultsm <- data.frame(sim=c(), iter=c(), alg=c(), r=c(), lhat=c())

for (k in 1:length(mats)) { 
  dat <- mats[[k]]
  desired_r <- 1:maxr[k]
  for (i in 1:length(dat$ks)) {  # i encodes simulation iteration
    for (j in length(interest[[k]])) {
      algname <- key[interest[[k]][j]]
      algid <- which(dimnames(dat$ks[[i]][[1]])[[1]] == algname)
      rs <- dat$ks[[i]][[1]][algid,,1][[algname]]
      algid <- which(dimnames(dat$Lhat)[[1]] == algname)
      lhats <- dat$Lhat[algid,,][[i]]
      lhat_adjust <- spline(rs, lhats, xout=desired_r, method='fmm', ties=mean)
      resultsm <- rbind(resultsm, data.frame(sim=sim_name[k], iter=i, alg=interest[[k]][j],
                                             r=lhat_adjust$x, lhat=lhat_adjust$y))
    }
  }
}

First, we prepare the plots of subsets of the mean and covariance matrices:

# run the simulations once to obtain some basic visualizations
n=100
# the simulations to call themselves
sims <- list(lol.sims.rtrunk, lol.sims.toep, lol.sims.rtrunk, lol.sims.fat_tails, lol.sims.qdtoep)
maxr <- c(30, 90, 30, 30, 30)
ds <- c(100, 100, 100, 1000, 100)
# additional arguments for each simulation scenario
opt_args <- list(list(), list(), list(K=3), list(rotate=TRUE), list())
sim_names = c("Trunk-2", "Toeplitz", "Trunk-3", "Fat-Tails (D=1000)", "QDA")
sim_titles = c("(A)", "(B)", "(C)", "(D)", "(E)")
ndim <- c(10, 10, 10, 10, 10)
sim_min <- c(10, 30, 10, 10, 10)
by <- c(10, 30, 10, 10, 10)

cov_plots <- list()
mean_plots <- list()
counter <- 1

for (i in 1:length(sims)) {
  simn <- do.call(sims[[i]], c(list(n, ds[i]), opt_args[[i]]))
  cov_plots[[counter]] <- sim_cov_plot(simn$Sigmas, simn$mus, simn$priors, title=sim_names[i])
  mean_plots[[counter]] <- sim_mean_plot(simn$mus, title=sim_names[i], ndim=ndim[i])
  counter <- counter + 1
}

Next, we aggregate over the respective iterations, and subset plots for each function:

# read the results in
results <- readRDS('./data/fig3/lol_fig3_lda.rds')
results <- rbind(results$overall[, colnames(results$overall) != 'se'], resultsm)
#results <- results$overall
nan.mean <- function(x) mean(x, na.rm=TRUE)
results.means <- aggregate(lhat ~ sim + alg + r + lhat, data = results, FUN = nan.mean)
algs <-  c("LOL", "QOQ", "ROAD", "LASSO", "PLS", "CCA", "PCA", "LDA")
acols <- c("#00FF00", "#00FF00", "#969696", "#969696", "#969696", "#525252", "#525252", "#525252")
names(acols) <- algs
linestyle <- c("solid", "dashed", "solid", "dashed", "dotted", "solid", "dashed", "dotted")
names(linestyle) <- algs
sim_plots <- list()
results.means$alg <- revalue(results.means$alg, c("cPCA"="LDA"))

results.means$type <- revalue(results.means$alg, linestyle)
results.means$color <- revalue(results.means$alg, acols)
counter <- 1
for (i in 1:length(sim_names)) {
  sim <- sim_names[i]
  data_sub <- results.means[results.means$sim == sim,]
  pt.dat <- data.frame(x=c(), y=c())
  for (alg in unique(data_sub$alg)) {
    pt <- compute_cutoff(data_sub[data_sub$alg == alg,]$r, data_sub[data_sub$alg == alg,]$lhat)
    pt.dat <- rbind(pt.dat, data.frame(r=pt$r, lhat=pt$lhat, alg=alg))
  }
  sim_plots[[counter]] <- plot_sim_lhats(data_sub, acols, pt.dat, linestyle, ylab=paste(sim_titles[i], sim), from=sim_min[i], by=by[i])
  counter <- counter + 1
}

We merge and combine the plots:

nsim <- length(sim_names)
sim_leg <- g_legend(sim_plots[[1]] + guides(colour = guide_legend(override.aes = list(shape = NA)))
)
cov_leg <- g_legend(cov_plots[[1]])
mean_leg <- g_legend(mean_plots[[3]])

# remove the legends from the plots
sim_plots <- sapply(1:length(sim_plots), function(j) {
    resp <- sim_plots[[j]] + ggtitle("") +theme(legend.position=NaN)
    # remove the ylabel of only the non-left most columns
    if (j != 1) {
      resp <- resp + xlab("")
    }
    return(resp)
  }, simplify=FALSE)
mean_plots <- sapply(1:length(mean_plots), function(j) {
    resp <- mean_plots[[j]] + ggtitle("") + theme(legend.position=NaN)
    # remove the ylabel of only the non-left most columns
    if (j != 1) {
      resp <- resp + xlab("") + ylab("")
    }
    return(resp)
  }, simplify=FALSE)

cov_plots <- sapply(1:length(cov_plots), function(j) {
    resp <- cov_plots[[j]] + ggtitle("") + theme(legend.position=NaN)
    # remove the ylabel of only the non-left most columns
    if (j != 1) {
      resp <- resp + xlab("") + ylab("")
    }
    return(resp)
  }, simplify=FALSE)

tfonts = 14
grid_sim <- grid.arrange(grid.arrange(grobs=sim_plots, nrow=nsim), sim_leg, nrow=2, heights=c(.95, .07),
                         top=textGrob("Misclassification Rate\n(D=100, n=100)", gp=gpar(fontsize=tfonts, face="bold")))
grid_mean <- grid.arrange(grid.arrange(grobs=mean_plots, nrow=nsim), mean_leg, nrow=2, heights=c(.95, .07),
                         top=textGrob("Means\n(First 10 Dimensions)", gp=gpar(fontsize=tfonts, face="bold")))
grid_cov <- grid.arrange(grid.arrange(grobs=cov_plots, nrow=nsim), cov_leg, nrow=2, heights=c(.95, .07),
                         top=textGrob("Covariances\n(First 10 Dimensions)", gp=gpar(fontsize=tfonts, face="bold")))

We combine and plot:

grid.arrange(grid_sim, grid_mean, grid_cov, ncol=3, widths=c(0.35, 0.25, 0.2))


neurodata/lol documentation built on March 3, 2021, 1:46 a.m.