Figure 3 - LOL Paper

require(lolR)
require(ggplot2)
require(latex2exp)
require(MASS)
require(gridExtra)
require(data.table)
require(reshape2)
require(R.matlab)
require(grid)
require(plyr)
classifier.alg = "lda"
# compute the cutoff for the particular trial to get an approximate elbow
# by computing the smallest r with an associated lhat within 5%
# of the global minimum lhat
compute_cutoff <- function(rs, lhats, t=0.05) {
  sr.ix <- sort(rs, decreasing=FALSE, index.return=TRUE)$ix
  # compute minimum value
  min.lhat <- min(lhats)
  # compute minimum value + 5%
  lhat.thresh <- (1 + t)*min.lhat
  # find which indices are all below this
  lhat.below <- which(lhats <= lhat.thresh)
  rs.below <- rs[lhat.below]; lhats.below <- lhats[lhat.below]
  tmin.ix <- min(rs.below, index.return=TRUE)
  return(list(r=rs.below[tmin.ix], lhat=lhats.below[tmin.ix]))
}

w=.8
h=.2
sim_cov_plot <- function(Sigmas, mus, priors, title="", yl="Dimension", xl="Dimension", ndim=10,
                         nbreaks=4, legend.name=TeX("")) {
  Sigma <- lolR:::lol.mvr(Sigmas, mus, priors)
  Sigma <- Sigma[1:ndim, 1:ndim]  # subset
  Sigma <- (Sigma - min(Sigma))/(max(Sigma) - min(Sigma))
  labs <- c(1, 10)
  sdat <- melt(Sigma)
  plot_cov <- ggplot(sdat, aes(x=Var1, y=Var2, fill=value)) +
    geom_tile() +
    ggtitle(title) +
    xlab(xl) +
    ylab(yl) +
    theme_bw() +
    scale_x_continuous(breaks=labs) +
    scale_y_reverse(breaks=labs) +
    theme(legend.position="bottom") +#, axis.title=element_text(size=14)) +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    scale_fill_gradientn(name=legend.name, colours=c("#fcfbfd", "#9e9ac8", "#3f007d"),
                         limits=c(0, 1), breaks=c(0.0, 0.5, 1.0))#,
                         #guide=guide_colorbar(title.position="top", title.hjust = .5, barheight=.75))
}

mcols <- c("#808080", "#EE7600", "#0e3ec1","#469990") #"#8B4513")
names(mcols) <- c("outlier", "1", "2", "3")
sim_mean_plot <- function(mus, title="", ylab="Magnitude", xlab="Dimension", ndim=10, nbreaks=4) {
  dat <- data.frame(mus[1:ndim,])
  dat <- cbind(data.frame(1:ndim), dat)
  K <- dim(mus)[2]
  ylabs <- sapply(1:K, function(k) as.character(k))
  colnames(dat) <- c("Dimension", ylabs)
  dat <- melt(dat, id="Dimension")
  xlabs <- c(1, 10)
  colnames(dat) <- c("Dimension", "Class", "Magnitude")
  dat$Magnitude = dat$Magnitude/(max(abs(dat$Magnitude)) + .0001)
  lims <- c(-1, 1)
  breaks= c(-1, 0, 1)
  dat$Class <- factor(dat$Class, levels=c(1, 2, 3))
  plot_mean <- ggplot(dat, aes(x=Dimension, y=Magnitude, color=Class)) +
    geom_line(size=1.2) +
    theme_bw() +
    ggtitle(title) +
    xlab(xlab) +
    ylab(ylab) +
    scale_y_continuous(limits=lims, breaks=breaks) +
    scale_x_continuous(breaks=xlabs) +
    theme(legend.position="bottom") +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    scale_color_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5))
}

plot_sim_lhats <- function(data, cols, pt.dat, linestyle, title="", by=10, from=10, ylab="Misclassification Rate",
                           xlab="# Embedded Dimensions", fsize=12) {
  lims <- c(floor(10*min(data$lhat))/10, ceiling(10*max(data$lhat))/10)
  if (unique(data$sim)[1] == "Toeplitz") {
    length.out=4
  } else {
    length.out=3
  }
  breaks = unique(round(seq(from=lims[1], to=lims[2], length.out = length.out), digits=1))
  xlims <- c(min(data$r), max(data$r))
  xbreaks <- seq(from=from, to=xlims[2], by=by)
  plot_sims <- ggplot(data, aes(x=r, y=lhat, linetype=alg, color=alg)) +
    geom_line(size=.95) +
    scale_color_manual(values=cols, guide=guide_legend(ncol=4, byrow=TRUE, override.aes=list(shape=NA)), name="Algorithm") +
    scale_linetype_manual(values=linestyle, guide=guide_legend(ncol=4, byrow=TRUE), name="Algorithm") +
    #geom_point(data=pt.dat, aes(x=r, y=lhat, color=alg), size=2) +
    #geom_line(data=base::subset(data, alg == "CCA"), aes(x=r, y=lhat, group=alg, linetype color=alg), size=.75) +
    #geom_point(data=base::subset(pt.dat, alg == "CCA"), aes(x=r, y=lhat, group=alg, color=alg), size=2) +
    #geom_line(data=base::subset(data, alg != "CCA" & alg != "QOQ"), aes(x=r, y=lhat, group=alg, color=alg), size=.75) +
    #geom_point(data=base::subset(pt.dat, alg != "CCA"), aes(x=r, y=lhat, group=alg, color=alg), size=2) +
    #geom_line(data=base::subset(data, alg == "QOQ"), aes(x=r, y=lhat, group=alg, color=alg), linetype="dashed", size=.75) +
    xlab(xlab) +
    ylab(ylab) +
    ggtitle(title) +
    theme_bw() +
    scale_y_continuous(limits=lims, breaks=breaks) +
    scale_x_continuous(limits=xlims, breaks=xbreaks) +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    theme(legend.position="bottom", axis.title.y=element_text(size=fsize))
  return(plot_sims)
}

sim_twod_scatter <- function(X, Y, title="", xlab="", ylab="", nbreaks=4, outlier=NULL, d=c(1, 2)) {
  reorder <- c()
  for (ylab in unique(Y)) {
    reorder <- c(reorder, sample(1:dim(X[Y == ylab,])[1], size=dim(X[Y==ylab,])[1], replace=FALSE))
  }
  if (!is.null(outlier)) {
    Y[outlier$inlier == FALSE] = "outlier"
  }
  X[, d[1]] <- X[, d[1]]/max(abs(X[, d[1]]))
  X[, d[2]] <- X[, d[2]]/max(abs(X[, d[2]]))
  xmax <- max(abs(X[,d[1]])); xmin <- -xmax
  ymax <- max(abs(X[,d[2]])); ymin <- -ymax
  X <- X[reorder,]; Y <- Y[reorder]
  df.dat <- data.frame(d1=X[, d[1]], d2=X[, d[2]], class=as.factor(Y))
  plot_sims <- ggplot(df.dat, aes(x=d1, y=d2, color=class)) +
    geom_point(alpha=0.5) +
    #xlab(paste("Dimension", d[1])) +
    #ylab(paste("Dimension", d[2])) +
    xlab(xlab) +
    ylab(ylab) +
    ggtitle(title) +
    theme_bw() +
    xlim(xmin, xmax) +
    ylim(ymin, ymax) +
    theme(legend.position="bottom") +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    scale_color_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5))
  return(plot_sims)
}

sim_nd_scatter <- function(X, Y, title="", ylab="Magnitude", xlab="Dimension", ndim=10, nbreaks=4) {
  reorder <- c()
  for (ylab in unique(Y)) {
    reorder <- c(reorder, sample(1:dim(X[Y == ylab,])[1], size=dim(X[Y==ylab,])[1]/4, replace=FALSE))
  }
  X <- X[reorder,]; Y <- Y[reorder]
  xmax <- max(abs(X)); xmin <- -xmax
  df.dat <- melt(X)
  names(df.dat) <- c("Point", "Dimension", "Magnitude")
  df.dat$class <- as.factor(Y[df.dat$Point])
  plot_sims <- ggplot(df.dat, aes(x=Dimension, y=Magnitude, color=class)) +
    geom_point(alpha=0.5) +
    xlab(xlab) +
    ylab(ylab) +
    ylim(xmin, xmax) +
    ggtitle(title) +
    theme_bw() +
    theme(legend.position="bottom") +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    scale_color_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5))
  return(plot_sims)
}


sim_d_fill <- function(mus, Sigmas, outlier=NULL, title="", ylab="Magnitude", number="", xlab="Dimension", ndim=10, nbreaks=4) {
  mu.dat <- melt(mus)
  d <- dim(mus)[1]
  names(mu.dat) <- c("Dimension", "Class", "Magnitude")
  mu.dat$std <- NaN
  for (i in 1:dim(mus)[2]) {
    mu.dat$std[mu.dat$Class == i] = sqrt(diag(Sigmas[,,i]))[mu.dat[mu.dat$Class == i,]$Dimension]
  }
  if (!is.null(outlier)) {
    outlier.dat <- data.frame(Dimension=c(), Class=c(), Magnitude=c(), std=c())
    for (i in 1:dim(outlier$mu.outlier)[2]) {
      outlier.dat <- rbind(outlier.dat, data.frame(Dimension=1:d, Class="outlier", Magnitude=outlier$mu.outlier[,i],
                                        std=sqrt(diag(outlier$sigma.outlier[,,i]))))
    }
    outlier.dat$Magnitude = 0
    mu.dat <- rbind(mu.dat, outlier.dat)
  }
  mu.dat$Class <- factor(mu.dat$Class, names(mcols))
  rescale <- max(abs(mu.dat$Magnitude) + mu.dat$std)
  mu.dat$Magnitude <- mu.dat$Magnitude/rescale
  mu.dat$std <- mu.dat$std/rescale
  plot_sims <- ggplot(mu.dat, aes(x=Dimension, y=Magnitude)) +
    geom_ribbon(alpha=0.3, aes(fill=Class, ymin=Magnitude + std, ymax=Magnitude - std)) +
    geom_line(aes(color=Class), size=1.5, alpha=0.9) +
    xlab(xlab) +
    # ylab(ylab) +
    ylab(paste(number, title, sep=" ")) +
    ggtitle(title) +
    theme_bw() +
    theme(legend.position="bottom") +
    scale_y_continuous(breaks=c(-1, 0, 1)) +
    theme(plot.margin = unit(c(h,w,h,h), "cm")) +
    scale_color_manual(values=mcols) +#, guide=guide_legend(title.position="top", title.hjust = .5))
    scale_fill_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5))
  return(plot_sims)
}

g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}

The below code will produce the required data, which runs LOL, cPCA, PCA, and LR-CCA at the desired simulation settings. Note that this function will multi-thread natively, and took approximately 7 hours to run on a 96 core machine with $\frac{7}{8}$ of the cores active.

source('./sims_driver.R')

Borrowing results from an earlier matlab implementation:

toep <- readMat('./data/sims/toeplitz.mat')
tr2 <- readMat('./data/sims/rtrunk.mat')
tr3 <- readMat('./data/sims/3trunk.mat')
ft <- readMat('./data/sims/fat_tails.mat')
qd <- readMat('./data/sims/r2toeplitz.mat')

maxr <- c(90, 30, 30, 30, 30)
minr <- 0
mats <- list(toep, tr2, tr3, ft, qd)
sim_name <- c("Toeplitz", "Trunk-2", "Trunk-3", "Fat-Tails (D=1000)", "QDA")

interest <- list(c("ROAD"), c("ROAD"), c("LASSO"), c("ROAD"), c("ROAD"))
key <- c("ROAD", "lasso")
names(key) <- c("ROAD", "LASSO")


resultsm <- data.frame(sim=c(), iter=c(), alg=c(), r=c(), lhat=c())

for (k in 1:length(mats)) { 
  dat <- mats[[k]]
  desired_r <- 1:maxr[k]
  for (i in 1:length(dat$ks)) {  # i encodes simulation iteration
    for (j in length(interest[[k]])) {
      algname <- key[interest[[k]][j]]
      algid <- which(dimnames(dat$ks[[i]][[1]])[[1]] == algname)
      rs <- dat$ks[[i]][[1]][algid,,1][[algname]]
      algid <- which(dimnames(dat$Lhat)[[1]] == algname)
      lhats <- dat$Lhat[algid,,][[i]]
      lhat_adjust <- spline(rs, lhats, xout=desired_r, method='fmm', ties=mean)
      resultsm <- rbind(resultsm, data.frame(sim=sim_name[k], iter=i, alg=interest[[k]][j],
                                             r=lhat_adjust$x, lhat=lhat_adjust$y))
    }
  }
}

First, we prepare the plots of subsets of the mean and covariance matrices:

# run the simulations once to obtain some basic visualizations
n=750
# the simulations to call themselves
sims <- list(lol.sims.rtrunk, lol.sims.rev_rtrunk, lol.sims.cross)
maxr <- c(30, 30, 30)
ds <- c(100, 100, 100)
# additional arguments for each simulation scenario
opt_args <- list(list(K=3), list(robust=n*.4, maxvar.outlier=1000), list())
#sim_names = c("Trunk-2", "Toeplitz", "Trunk-3", "Fat-Tails (D=1000)", "Robust", "Cross")
#sim_titles = c("(A)", "(B)", "(C)", "(D)", "(E)", "(F)")
sim_names = c("Trunk-3", "Robust", "Cross")
sim_titles = c("(A)", "(B)", "(C)")
ndim <- c(10, 10, 10, 10, 10, 10)
sim_min <- c(10, 10, 10)
by <- c(10, 10, 10)

cov_plots <- list()
mean_plots <- list()
two.scatter <- list()
sim.fill <- list()
counter <- 1
dims <- list(c(1, 2), c(2, 50), c(50, 51))
sim.dims <- list()
for (i in 1:length(sims)) {
  simn <- do.call(sims[[i]], c(list(n, ds[i]), opt_args[[i]]))
  cov_plots[[counter]] <- sim_cov_plot(simn$Sigmas, simn$mus, simn$priors, title=sim_names[i])
  mean_plots[[counter]] <- sim_mean_plot(simn$mus, title=sim_names[i], ndim=ndim[i])
  sim.fill[[counter]] <- sim_d_fill(simn$mus, simn$Sigmas, outlier=simn$robust, title=sim_names[i], number=sim_titles[i])
  d <- dims[[i]]
  sim.dims[[counter]] <- sim_twod_scatter(simn$X, simn$Y, title=paste(sim_titles[i], sim_names[i]), outlier=simn$robust, d=d)
  counter <- counter + 1
}

Next, we aggregate over the respective iterations, and subset plots for each function:

# read the results in
results <- readRDS(paste('../data/sims/lol_sims_', classifier.alg, '.rds', sep=""))
#results <- results$overall
nan.mean <- function(x) mean(x, na.rm=TRUE)
results.means <- aggregate(lhat ~ sim + alg + r + lhat, data = results, FUN = nan.mean)

algs <-  c("LOL", "RLOL", "QOQ", "PLS", "CCA", "LDA", "PCA", "RP", "ROAD", "LASSO")
acols <- c("#f41711", "#f41711", "#f41711", "#94d6c9", "#87c6cc", "#99b4c6", "#020202", "#666666", "#666666", "#666666")
linestyle <- c(1,3,3,1,3,1,1,3,1)
linestyle <- c("solid", "dotted", "dashed", "dotted", "solid", "dotted", "solid", "solid", "solid", "dotted")
names(acols) <- algs
names(linestyle) <- algs
sim_plots <- list()
results.means$alg <- revalue(results.means$alg, c("cPCA"="LDA"))
results.means$alg <- revalue(results.means$alg, c("LRLDA"="LDA"))
results.means$alg <- revalue(results.means$alg, c("QOL"="QOQ"))
results.means$alg <- revalue(results.means$alg, c("LOL"="LOL"))
results.means$alg <- revalue(results.means$alg, c("LDA"="LDA"))


counter <- 1

for (i in 1:length(sim_names)) {
  sim <- sim_names[i]
  data_sub <- results.means[results.means$sim == sim & results.means$alg %in% algs,]
  pt.dat <- data.frame(x=c(), y=c())
  for (alg in unique(data_sub$alg)) {
    pt <- compute_cutoff(data_sub[data_sub$alg == alg,]$r, data_sub[data_sub$alg == alg,]$lhat)
    pt.dat <- rbind(pt.dat, data.frame(r=pt$r, lhat=pt$lhat, alg=alg))
  }
  if (i == 2) {
    pt.dat <- pt.dat[pt.dat$alg != "QOQ",]
    data_sub <- data_sub[data_sub$alg != "QOQ",]
  } else if (i == 3) {
    pt.dat <- pt.dat[pt.dat$alg != "RLOL",]
    data_sub <- data_sub[data_sub$alg != "RLOL",]
  } else {
    pt.dat <- pt.dat[pt.dat$alg != "RLOL" & pt.dat$alg != "QOQ",]
    data_sub <- data_sub[data_sub$alg != "RLOL" & data_sub$alg != "QOQ",]
  }
  sim_plots[[counter]] <- plot_sim_lhats(data_sub, acols, pt.dat, linestyle, ylab=paste(sim_titles[i], sim),
                                         from=sim_min[i], by=by[i])
  counter <- counter + 1
}

We merge and combine the plots:

nsim <- length(sim_names)
fsize=12
sim_leg.plot <- ggplot(data=subset(results.means, !(alg  %in% c("ROAD", "LASSO"))), aes(x=r, y=lhat, color=alg, linetype=alg)) +
  geom_line(size=1.2) +
  scale_color_manual(values=acols, guide=guide_legend(ncol=2, byrow=FALSE), name="Algorithm") +
  scale_linetype_manual(values=linestyle, guide=guide_legend(ncol=2, byrow=FALSE), name="Algorithm") +
  theme(legend.position="bottom", axis.title.y=element_text(size=fsize)) +
  theme_bw()

sim_leg <- g_legend(sim_leg.plot)
cov_leg <- g_legend(cov_plots[[1]])
mean_leg <- g_legend(mean_plots[[3]])

dummydat <- data.frame(Class=sample(names(mcols), size=50, replace=TRUE), Dimension=1:50, Magnitude=0, std=2)
simfill_leg.plot <- ggplot(dummydat, aes(x=Dimension, y=Magnitude, color=Class, fill=Class)) +
  geom_line(size=1.5) +
  geom_ribbon(aes(ymin=Magnitude -std, ymax=Magnitude + std), alpha=0.3) +
  scale_color_manual(values=mcols, guide=guide_legend(nrow=4, byrow=TRUE)) +
  scale_fill_manual(values=mcols, guide=guide_legend(nrow=4, byrow=TRUE))

simfill_leg <- g_legend(simfill_leg.plot)

# remove the legends from the plots
sim_plots <- sapply(1:length(sim_plots), function(j) {
    resp <- sim_plots[[j]] + ggtitle("") +theme(legend.position=NaN)
    # remove the ylabel of only the non-left most columns
    if (j != 1) {
      resp <- resp + xlab("") + ylab("") +
        theme(axis.text.x=element_text(color="#FFFFFF"))
    } else {
      resp <- resp + ylab("Error")
    }
    return(resp)
  }, simplify=FALSE)

mean_plots <- sapply(1:length(mean_plots), function(j) {
    resp <- mean_plots[[j]] + ggtitle("") + theme(legend.position=NaN)
    # remove the ylabel of only the non-left most columns
    if (j != 1) {
      resp <- resp + xlab("") + ylab("")
    }
    return(resp)
  }, simplify=FALSE)

cov_plots <- sapply(1:length(cov_plots), function(j) {
    resp <- cov_plots[[j]] + ggtitle("") + theme(legend.position=NaN)
    # remove the ylabel of only the non-left most columns
    if (j != 1) {
      resp <- resp + xlab("") + ylab("")
    }
    return(resp)
  }, simplify=FALSE)

sim.fill <- sapply(1:length(sim.fill), function(j) {
    resp <- sim.fill[[j]] + ggtitle("") + theme(legend.position=NaN)
    # remove the ylabel of only the non-left most columns
    if (j != 1) {
      resp <- resp + xlab("") +# ylab("") +
        theme(axis.text.x=element_text(color="#FFFFFF"),
              axis.text.y=element_text(color="#FFFFFF"))
    }
    return(resp)
  }, simplify=FALSE)

sim.dims <- sapply(1:length(sim.dims), function(j) {
    resp <- sim.dims[[j]] + theme(legend.position=NaN)
    # remove the ylabel of only the non-left most columns
    if (j != 0) {
      resp <- resp + xlab("") + ylab("")
    } else {
      resp <- resp + xlab("") + ylab("")
    }
    return(resp)
  }, simplify=FALSE)

tfonts = 14
grid_sim <- grid.arrange(grid.arrange(grobs=sim_plots, ncol=nsim), sim_leg, nrow=1, widths=c(.95, .18)
                         )
grid_mean <- grid.arrange(grid.arrange(grobs=mean_plots, ncol=nsim), mean_leg, nrow=2, heights=c(.95, .09),
                         top=textGrob("Means\n(First 10 Dimensions)", gp=gpar(fontsize=tfonts, face="bold")))
grid_cov <- grid.arrange(grid.arrange(grobs=cov_plots, ncol=nsim), cov_leg, nrow=2, heights=c(.95, .09),
                         top=textGrob("Covariances\n(First 10 Dimensions)", gp=gpar(fontsize=tfonts, face="bold")))
grid_simfill <- grid.arrange(grid.arrange(grobs=sim.fill, ncol=nsim), simfill_leg, nrow=2, heights=c(.95, .09),
                         top=textGrob("Generative Model\n(p=100)", gp=gpar(fontsize=tfonts, face="bold")))
grid_simdim <- grid.arrange(grid.arrange(grobs=sim.dims, ncol=nsim), simfill_leg, nrow=1, widths=c(.95, .18)
                            )

We combine and plot:

grid.arrange(grid_simdim, grid_sim, nrow=2, heights=c(0.35, 0.35))


neurodata/lol documentation built on March 3, 2021, 1:46 a.m.