In this notebook, we compare the performance of LOL to that of several other linear embedding techniques on the problems from the UCI and PMLB repositories that can be classified as high-dimensional, low sample size (HDLSS), i.e., $d > 100$. The data below were collected with $k=50$-fold cross-validation. The testing set was rotated across all folds, with the training set comprising the remaining $k-1=49$ folds. As only a handful of problems natively satisfy $n < d$, we subsample each training set to $\textrm{min}\left(n\frac{k-1}{k}, d-1\right)$ samples, where $n$ is the number of provided samples, $\frac{k-1}{k} = \frac{49}{50}$ is the fraction of samples available for training, and $d$ is the native dimensionality of the data. This ensures that every example shown below is HDLSS. The datasets were run using the real data driver script.
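
A minimal sketch of this subsampling rule, with hypothetical values of n, d, and k (the numbers are illustrative, not from the benchmark data):

# hypothetical dataset sizes
n.ex <- 200; d.ex <- 150; k.ex <- 50
# cap the training-set size so the experiment remains HDLSS (ntrain < d)
ntrain.ex <- min(floor(n.ex*(k.ex - 1)/k.ex), d.ex - 1)  # 149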

knitr::opts_chunk$set(warning = FALSE)
require(lolR)
require(ggplot2)
require(latex2exp)
require(MASS)
require(gridExtra)
require(data.table)
require(reshape2)
require(R.matlab)
require(grid)
require(plyr)
require(slb)
require(scales)
require(stringr)
require(ggbeeswarm)
classifier.name <- "lda"
opath <- './data/real_data'
repo.name <- "uci"

cmp.alg="LOL"
cmp.algname="lol"

algs <-  c("LOL", "PLS", "CCA", "LRLDA", "PCA", "RP")
acols <- c("#f41711", "#94d6c9", "#87c6cc", "#99b4c6", "#020202", "#5f8793")
linestyle <- c("solid", "dotted", "solid","dotted", "solid", "solid")
names(linestyle) <- algs
names(algs) <- acols
names(acols) <- algs
#shapes <- c(21, 24, 21, 24, 23, 23, 21, 24, 23)
shapes <- c(21, 24, 21, 22, 21, 23)
names(shapes) <- algs
# algs <-  c("RLOL", "RLRLDA", "RPCA")
# acols <- c("#f41711", "#99b4c6", "#020202")
# linestyle <- c("solid", "dotted", "solid")
# names(linestyle) <- algs
# names(algs) <- acols
# names(acols) <- algs
# #shapes <- c(21, 24, 21, 24, 23, 23, 21, 24, 23)
# shapes <- c(21, 22, 21)
# names(shapes) <- algs

# compute the cutoff for the particular trial to get an approximate elbow
# by finding the smallest r whose associated lhat is within a factor of
# (1 + t) of the global minimum lhat (e.g., t=0.05 gives within 5%)
compute_cutoff <- function(rs, lhats, t=0) {
  # drop entries where either rs or lhats is missing, using a single mask so
  # the two vectors stay aligned (filtering them sequentially would desynchronize them)
  keep <- complete.cases(rs) & complete.cases(lhats)
  rs <- rs[keep]; lhats <- lhats[keep]
  # compute minimum value
  min.lhat <- min(lhats)
  # compute the threshold: the minimum value inflated by a factor of (1 + t)
  lhat.thresh <- (1 + t)*min.lhat
  # find which indices fall at or below this threshold
  lhat.below <- which(lhats <= lhat.thresh)
  rs.below <- rs[lhat.below]; lhats.below <- lhats[lhat.below]
  # take the smallest r among those below the threshold
  tmin.ix <- which.min(rs.below)
  return(list(r=rs.below[tmin.ix], lhat=lhats.below[tmin.ix]))
}
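
A usage sketch with toy inputs (the numbers are illustrative, not from the real data):

# toy embedding dimensions and mean error rates
rs.toy <- c(1, 5, 10, 20); lhats.toy <- c(0.40, 0.25, 0.24, 0.26)
compute_cutoff(rs.toy, lhats.toy, t=0.05)  # r=5: 0.25 is within 5% of the minimum 0.24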

w=.8
h=.2
plot_sim_lhats <- function(data_sub, data_sub.optimalr, cols, linetype, shape, title="", from=10, ylab=TeX("$\\hat{L}$"),
                           xlab="Embedded Dimensions", fsize=12, length.out=3) {
  lims <- c(floor(100*min(data_sub.optimalr$lhat.alg, na.rm=TRUE))/100, ceiling(100*max(data_sub.optimalr$lhat.alg, na.rm=TRUE))/100)
  if (is.na(sum(lims))) {
    return(ggplot())
  }
  tryCatch({
    breaks = unique(round(seq(from=lims[1], to=lims[2], length.out = length.out), digits=1))
    xlims <- c(min(data_sub$r.alg, na.rm=TRUE), max(data_sub$r.alg, na.rm=TRUE))
    xbreaks <- round(seq(from=from, to=xlims[2], length.out=length.out))
    plot_sims <- ggplot(data_sub, aes(x=r.alg, y=lhat.alg, linetype=alg, shape=alg, fill=alg, color=alg)) +
      stat_summary(size=.95, fun.y="mean", geom="line") +
      #stat_summary(fun.data = mean_cl_normal, geom = "errorbar", fun.args = list(mult = 1)) +
      geom_point(data=data_sub.optimalr, aes(x=r.alg, y=lhat.alg, shape=alg, fill=alg, color=alg), size=2) +
      scale_color_manual(values=cols, limits=names(cols),
                         guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      scale_fill_manual(values=cols, limits=names(cols),
                        guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      scale_shape_manual(values=shape, limits=names(cols),
                         guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      scale_linetype_manual(values=linetype, limits=names(cols),
                         guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      xlab(xlab) +
      ylab(ylab) +
      ggtitle(title) +
      theme_bw() +
      coord_cartesian(ylim=lims) +
      scale_x_continuous(limits=xlims, breaks=xbreaks) +#, trans=log10_trans()) +
      theme(plot.margin = unit(c(h,w,h,h), "cm")) +
      theme(legend.position="bottom", text=element_text(size=fsize))
    return(plot_sims)
  }, error=function(e) {return(ggplot())})
}

plot_sim_scatter <- function(data.sub.foldwise, cols, shape, title="",
                             ylab=TeX("$\\frac{\\hat{L}_{LOL} - \\hat{L}_{alg}}{\\hat{L}_{chance}}$"), psize=1,
                             xlab="Algorithm", fsize=12) {
  tryCatch({
      plot_sims <- ggplot(data=subset(data.sub.foldwise, alg != "LOL"), aes(x=alg, y=lhat.norm, color=alg, group=alg)) +
        geom_beeswarm(alpha=0.5, size=psize) +
        xlab("Algorithm") +
        scale_color_manual(values=cols, limits=names(cols),
                           guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
        scale_fill_manual(values=cols, limits=names(cols),
                          guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
        scale_shape_manual(values=shape, limits=names(cols),
                           guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
        xlab(xlab) +
        ylab(ylab) +
        ggtitle(title) +
        theme_bw() +
        theme(plot.margin = unit(c(h,w,h,h), "cm")) +
        theme(legend.position="bottom", text=element_text(size=fsize))
      return(plot_sims)
  }, error=function(e) {return(ggplot())})
}



g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}

parse_mrn <- function() {
  data <- list(LOL=lol.mrn, PCA=pca.mrn, LRLDA=lrlda.mrn)
  result <- data.frame()
  for (alg in names(data)) {
    colnames(data[[alg]]) <- c(1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
    for (fold in 1:dim(data[[alg]])[1]) {
      result <- rbind(result, data.frame(exp="MRN", alg=alg, xv=40, n=114, ntrain=111, d=500000000, K=2,fold=fold,
                                         r=colnames(data[[alg]]),lhat=as.numeric(data[[alg]][fold,]), repo="neurodata"))
    }
  }
  result <- rbind(result, data.frame(exp="MRN", alg="RandomGuess", xv=40, n=114, ntrain=111, d=500000000, K=2, fold=1:dim(pca.mrn)[1],
                                     r=NaN, lhat=0.5, repo="neurodata"))
  return(result)
}

lol.mrn <- read.csv('./data/real_data/LOL-MRN-40.csv')
pca.mrn <- read.csv('./data/real_data/PCA-LDA-MRN-40.csv')
lrlda.mrn <- read.csv('./data/real_data/RR-LDA-MRN-40.csv')
results.mrn <- parse_mrn()

Results Loading

results <- readRDS(file.path(opath, paste(classifier.name, "_results.rds", sep="")))
results <- rbind(results, results.mrn)
exp_names <- unique(as.character(results$exp))
# filter out bad rows
results <- results[complete.cases(results$lhat) & !(is.infinite(results$lhat)) & complete.cases(results[colnames(results) != "r"]),]
exp_names <- unique(as.character(results$exp))
# make sure columns of interest are numeric
numeric.cols <- c("d", "n", "ntrain", "K", "fold", "lhat", "r")
for (col in numeric.cols) {
  results[[col]] <- as.numeric(results[[col]])
  if (col == "ntrain") {  # make sure that ntrain is constant across experiments, due to situation where one fold has fewer examples than another
    for (exp in exp_names) {
      results[results$exp == exp,]$ntrain = median(results[results$exp == exp,]$ntrain, na.rm=TRUE)
    }    
  }
}
results <- results[results$d > 100 & results$ntrain/results$K > 10,]  # keep only datasets with d > 100 and more than 10 training samples per class
# filter out duplicate experiments
results <- results[!(results$exp %in% c("splice", "promoters", "molecular-biology_promoters", "rorb")),]
nan.mean <- function(x) {mean(x[!is.infinite(x)], na.rm=TRUE)}
results.means <- aggregate(lhat ~ exp + alg + r + d + n + ntrain + K, data = results, FUN = nan.mean)
random.results <- aggregate(lhat ~ exp + alg, data=subset(results, alg == "RandomGuess"), FUN=mean)

Analysis

WRT LOL

#nan.median <- function(x) median(x, na.rm=TRUE)  
lhat.mean <- 1
k.fold <- length(unique(results$fold))
#results.medians <- aggregate(lhat ~ exp + alg + r + d + n + K, data = results, FUN = nan.median)
results.optimalr <- data.frame()
results.overall <- data.frame()
for (i in 1:length(exp_names)) {
  r.max <- max(results.means[results.means$exp == as.character(exp_names[i]),]$r)
  ss.chance <- results[results$alg == "RandomGuess" & results$exp == as.character(exp_names[i]),]
  colnames(ss.chance)[colnames(ss.chance) == "lhat"] = "lhat.chance"
  ss.chance <- ss.chance[, colnames(ss.chance) != "r"]
  for (j in 1:length(algs)) {
    tryCatch({
      alg <- as.character(algs[j])
      ss <- results[results$exp == exp_names[i] & results$alg == algs[j] & complete.cases(results),]

      ss.means <- aggregate(lhat ~ r + n + d, data=ss, FUN = nan.mean)
      rs <- ss.means$r; lhats <- ss.means$lhat
      min.result <- compute_cutoff(rs, lhats)
      r.star <- min.result$r
      ss.optimalr <- results[results$exp == exp_names[i] & results$alg == algs[j] & results$r == r.star,]
      if (alg == cmp.alg) {
        r.lol <- r.star
        lol.ss <- ss
        colnames(lol.ss)[colnames(lol.ss) == "lhat"] = sprintf("lhat.%s", cmp.algname)
        lol.optimalr <- ss.optimalr
        colnames(lol.optimalr)[colnames(lol.optimalr) == "lhat"] = sprintf("lhat.%s", cmp.algname)
        colnames(lol.optimalr)[colnames(lol.optimalr) == "rstar.norm"] = sprintf("rstar.norm.%s", cmp.algname)
      }
      colnames(ss)[colnames(ss) == "lhat"] = "lhat.alg"
      colnames(ss.optimalr)[colnames(ss.optimalr) == "lhat"] = "lhat.alg"
      ss.merged <- merge(ss, lol.ss, by=c("exp", "fold", "n", "d", "K", "ntrain", "repo", "xv"))
      ss.merged <- merge(ss.merged, ss.chance, by=c("exp", "fold", "n", "d", "K", "ntrain", "repo", "xv"), all=TRUE)

      ss.optimalr <- merge(ss.optimalr, lol.optimalr, by=c("exp", "fold", "n", "d", "K", "ntrain", "repo", "xv"))
      ss.optimalr <- merge(ss.optimalr, ss.chance, by=c("exp", "fold", "n", "d", "K", "ntrain", "repo", "xv"), all=TRUE)

      colnames(ss.merged)[colnames(ss.merged) == "r.x"] = "r.alg"
      colnames(ss.optimalr)[colnames(ss.optimalr) == "r.x"] = "r.alg"
      colnames(ss.merged)[colnames(ss.merged) == "r.y"] = sprintf("r.%s", cmp.algname)
      colnames(ss.optimalr)[colnames(ss.optimalr) == "r.y"] = sprintf("r.%s", cmp.algname)
      colnames(ss.merged)[colnames(ss.merged) == "alg"] <- "chance"
      colnames(ss.merged)[colnames(ss.merged) == "alg.x"] <- "alg"
      colnames(ss.optimalr)[colnames(ss.optimalr) == "alg"] <- "chance"
      colnames(ss.optimalr)[colnames(ss.optimalr) == "alg.x"] <- "alg"

      ss.merged$lhat.norm <- (ss.merged[[sprintf("lhat.%s", cmp.algname)]] - ss.merged$lhat.alg)/ss.merged$lhat.chance
      ss.merged$r.max <- r.max

      ss.optimalr$lhat.norm <- (ss.optimalr[[sprintf("lhat.%s", cmp.algname)]] - ss.optimalr$lhat.alg)/ss.optimalr$lhat.chance
      ss.optimalr$rstar.norm <- (ss.optimalr[[sprintf("r.%s", cmp.algname)]] - ss.optimalr$r.alg)/r.max
      ss.optimalr$r.max <- r.max
      results.overall <- rbind(results.overall, ss.merged)
      results.optimalr <- rbind(results.optimalr, ss.optimalr)
    }, error=function(e) {print(sprintf("exp: %d, alg: %d", i, j))})
  }
}
results.overall <- results.overall[complete.cases(results.overall),]
results.optimalr <- results.optimalr[complete.cases(results.optimalr),]

results.exp.overall <- aggregate(list(lhat.norm=results.overall$lhat.norm, lhat.alg=results.overall$lhat.alg,
                                       lhat.lol=results.overall[[sprintf("lhat.%s", cmp.algname)]],
                                      lhat.chance=results.overall$lhat.chance, r.max=results.overall$r.max),
                                  by=list(exp=results.overall$exp, n=results.overall$n, K=results.overall$K,
                                          ntrain=results.overall$ntrain, d=results.overall$d,
                                          repo=results.overall$repo, r.alg=results.overall$r.alg,
                                          alg=results.overall$alg), FUN=nan.mean)

results.exp.optimalr <- aggregate(list(lhat.norm=results.optimalr$lhat.norm, lhat.alg=results.optimalr$lhat.alg,
                                       lhat.lol=results.optimalr[[sprintf("lhat.%s", cmp.algname)]],
                                       lhat.chance=results.optimalr$lhat.chance,
                                       rstar.norm=results.optimalr$rstar.norm, r.max=results.optimalr$r.max),
                                  by=list(exp=results.optimalr$exp, n=results.optimalr$n, K=results.optimalr$K,
                                          ntrain=results.optimalr$ntrain, d=results.optimalr$d,
                                          repo=results.optimalr$repo, r.alg=results.optimalr$r.alg,
                                          alg=results.optimalr$alg), FUN=nan.mean)

Per-Dataset Plots

Given algorithm $a$, where $L_{a, j, i}$ is the $i^{th}$ fold's misclassification rate for dataset $j$ of the $D$ datasets, and $r \in [1, ..., p]$:

\begin{align} \bar{L}_{a, j}(r) = \textrm{mean}_{i \in [1, ..., k]}\left\{L_{a, j, i}(r)\right\} \end{align}

A single dot is produced where:

\begin{align} r^*_{a, j} = \textrm{argmin}_r \left\{\bar{L}_{a, j}(r)\right\} \end{align}

which is defined as the optimum number of embedding dimensions.
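
In the code above, this corresponds to compute_cutoff(rs, lhats) with the default t=0, which returns the smallest r attaining the minimum mean error. A direct sketch, assuming the results.means data frame built earlier ("mnist" and "LOL" are example keys only):

# mean error curve for one (dataset, algorithm) pair
ss.ex <- subset(results.means, exp == "mnist" & alg == "LOL")
# optimal embedding dimension: the first (smallest, for rows ascending in r)
# r attaining the minimum mean error
r.star.ex <- ss.ex$r[which.min(ss.ex$lhat)]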

plots.curves <- list()
plots.scatters <- list()
for (i in 1:length(exp_names)) {
  exp <- exp_names[i]
  data_sub <- results.overall[results.overall$exp == exp,]
  data_sub.optimalr <- results.exp.optimalr[results.exp.optimalr$exp == exp,]
  data_sub.optimalr.foldwise <- results.optimalr[results.optimalr$exp == exp,]
  tryCatch({
    plots.curves[[i]] <- plot_sim_lhats(data_sub, data_sub.optimalr, acols, linestyle, shapes, ylab="",
                                 title=sprintf("Exp %d, K=%d, n=%d, p=%d", i, data_sub[1,]$K, data_sub[1,]$ntrain, data_sub[1,]$d), 
                                 from=1, fsize = 7)
    plots.scatters[[i]] <- plot_sim_scatter(data_sub.optimalr.foldwise, acols, shapes,
                                 title=sprintf("Exp %d, K=%d, n=%d, p=%d", i, data_sub[1,]$K, data_sub[1,]$ntrain, data_sub[1,]$d),
                                 fsize=12)

  }, error=function(e){NaN})
}

curves.plot_leg <- g_legend(plots.curves[[1]])
scatters.plot_leg <- g_legend(plots.scatters[[1]])
plots.curves <- lapply(plots.curves, function(plot) plot + theme(legend.position="none") + xlab("") + ylab(""))
plots.curves[[1]] <- plots.curves[[1]] + xlab("Embedded Dimensions") + ylab("Misclassification Rate")
plots.scatters <- lapply(plots.scatters, function(plot) plot + theme(legend.position="none") + xlab("") + ylab(""))
plots.scatters[[1]] <- plots.scatters[[1]] + xlab("Algorithm") +
  ylab(TeX("Normalized Misclassification Rate"))

grid.arrange(arrangeGrob(grobs=plots.curves, nrow=floor(sqrt(length(plots.curves)))), curves.plot_leg, nrow=2, heights=c(0.8, .15))
grid.arrange(arrangeGrob(grobs=plots.scatters, nrow=floor(sqrt(length(plots.scatters)))), scatters.plot_leg, nrow=2, heights=c(0.8, .15))

Quadrant Plot

For algorithm $a$ and problem $j$, the relative embedding dimension:

\begin{align} \left|\left|r^*_{a, j}\right|\right| = \frac{r^*_{LOL, j} - r^*_{a, j}}{p} \end{align}

with relative misclassification rate per-dataset:

\begin{align} \left|\left|\bar{L}_{a, j}\right|\right| = \textrm{mean}_{i \in [1, ..., k]}\left\{\frac{L_{LOL, j, i}\left(r^*_{LOL, j}\right) - L_{a, j, i}\left(r^*_{a, j}\right)}{L_{chance, j, i}}\right\} \end{align}

where the $chance$ classifier is simply the classifier that always guesses the "most-present" class (the class with the highest prior) in the particular fold.
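
A sketch of these quantities with hypothetical numbers (the labels, error rates, and dimensions are illustrative only):

# chance classifier: always guess the majority class within the fold
y.ex <- c(rep(1, 70), rep(2, 30))
lhat.chance.ex <- 1 - max(table(y.ex))/length(y.ex)  # 0.3
# hypothetical per-fold misclassification rates at the optimal dimensions
lhat.lol.ex <- 0.10; lhat.alg.ex <- 0.13
(lhat.lol.ex - lhat.alg.ex)/lhat.chance.ex  # -0.1; negative favors LOL
# hypothetical optimal dimensions; negative means LOL embeds into fewer dimensions
r.lol.ex <- 10; r.alg.ex <- 25; p.ex <- 100
(r.lol.ex - r.alg.ex)/p.ex  # -0.15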

make_marginal_2d <- function(data, xlims, ylims, plot.title="", xl="", yl="", leg.title="",
                             legend.style=guide_legend(ncol=2, byrow=TRUE)) {
  data$exp <- factor(data$exp)
  box <- data.frame(x=c(min(xlims), mean(xlims), mean(xlims), min(xlims)),
                    y=c(min(ylims), min(ylims), mean(ylims), mean(ylims)))
  box2 <- data.frame(x=c(max(xlims), mean(xlims), mean(xlims), max(xlims)),
                    y=c(max(ylims), max(ylims), mean(ylims), mean(ylims)))
  data.medians <- aggregate(list(lhat.norm=data$lhat.norm, rstar.norm=data$rstar.norm), by=list(alg=data$alg), FUN=median)
  data.medians <- rbind(data.medians, data.frame(lhat.norm=0, rstar.norm=0, alg="LOL"))  # add dot for LOL
  # table results
  tab <- data.frame(alg=c(), q1=c(), q2=c(), q3=c(), q4=c())
  for (alg in levels(data$alg)) {
    ss <- data[data$alg == alg,]
    tab <- rbind(tab, data.frame(alg=alg, q1=sum(ss$rstar.norm >= 0 & ss$lhat.norm >= 0),
                                 q2 = sum(ss$rstar.norm <= 0 & ss$lhat.norm >= 0),
                                 q3 = sum(ss$rstar.norm <= 0 & ss$lhat.norm <= 0),
                                 q4 = sum(ss$rstar.norm >= 0 & ss$lhat.norm <= 0)))
  }
  tab <- rbind(tab, data.frame(alg="overall", q1=sum(data$rstar.norm >= 0 & data$lhat.norm >= 0),
                               q2 = sum(data$rstar.norm <= 0 & data$lhat.norm >= 0),
                               q3 = sum(data$rstar.norm <= 0 & data$lhat.norm <= 0),
                               q4 = sum(data$rstar.norm >= 0 & data$lhat.norm <= 0)))
  tab[,2:dim(tab)[2]] <- tab[,2:dim(tab)[2]]/apply(tab[,2:dim(tab)[2]], c(1), sum)
  print(tab)
  center <- ggplot(data, aes(x=rstar.norm, y=lhat.norm)) +
    geom_polygon(data=box, aes(x=x, y=y), fill='green', alpha=0.15) +
    geom_polygon(data=box2, aes(x=x, y=y), fill='red', alpha=0.15) +
    geom_point(aes(x=rstar.norm, y=lhat.norm, shape=alg, color=alg, fill=alg), alpha=0.5, size=1.2) +
    geom_point(data=data.medians, aes(x=rstar.norm, y=lhat.norm, shape=alg, color=alg, fill=alg), alpha=1.0, size=2.5) +
    scale_fill_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_color_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_shape_manual(values=shapes, guide=legend.style, name=leg.title) +
    ylab(yl) +
    xlab(xl) +
    labs(shape="Simulation", color="Algorithm") +
    ggtitle("") +
    scale_y_continuous(limits=ylims) +
    scale_x_continuous(limits=xlims) +
    theme_bw() +
    annotate("text", size=4, label="LOL Better", x=-.5, y=.1, color="green4") +
    annotate("text", size=4, label="LOL Better", x=1.0, y=-.05, angle=-90, color="green4") +
    annotate("text", size=4, label="LOL Worse", x=.5, y=.1, color="red") +
    annotate("text", size=4, label="LOL Worse", x=1.0, y=.05, angle=-90, color="red")
    # annotate("text", size=4, label=TeX("Q1"), x=.5, y=.055) +
    # annotate("text", size=4, label=TeX(sprintf("$r^* \\geq r^*_{%s}$", cmp.alg)), x=-.5, y=.1) +
    # annotate("text", size=4, label=TeX("Q2"), x=-.5, y=.055) +
    # annotate("text", size=4, label=TeX(sprintf("$L^* \\geq L^*_{%s}$", cmp.alg)), x=1.0, y=-.05, angle=-90) +
    # annotate("text", size=4, label=TeX("Q3"), x=-.5, y=-.05) +
    # annotate("text", size=4, label=TeX(sprintf("$r^* \\leq r^*_{%s}$", cmp.alg)), x=.5, y=.1) +
    # annotate("text", size=4, label=TeX("Q4"), x=.5, y=-.05) +
    # annotate("text", size=4, label=TeX(sprintf("$L^* \\leq L^*_{%s}$", cmp.alg)), x=1.0, y=.05, angle=-90)
  center_leg <- ggplot(data, aes(x=rstar.norm, y=lhat.norm)) +
    geom_polygon(data=box, aes(x=x, y=y), fill='red', alpha=0.15) +
    geom_line(aes(x=rstar.norm, y=lhat.norm, linetype=alg, color=alg), alpha=1.0, size=1.0) +
    geom_point(aes(x=rstar.norm, y=lhat.norm, shape=alg, color=alg, fill=alg), alpha=0.4, size=1.2) +
    geom_point(data=data.medians, aes(x=rstar.norm, y=lhat.norm, shape=alg, color=alg, fill=alg), alpha=1.0, size=2.5) +
    scale_fill_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_color_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_shape_manual(values=shapes, guide=legend.style, name=leg.title) +
    scale_linetype_manual(values=linestyle, guide=legend.style, name=leg.title) +
    ylab(yl) +
    xlab(xl) +
    labs(shape="Simulation", color="Algorithm") +
    ggtitle("") +
    scale_y_continuous(limits=ylims) +
    scale_x_continuous(limits=xlims) +
    theme_bw() +
    guides(linetype=FALSE)
  leg <- g_legend(center_leg)
  center <- center + theme(legend.position="none")
  right <- ggplot(data, aes(x=lhat.norm, y=..scaled.., color=alg, linetype=alg)) +
    geom_density(size=1.1) +
    scale_color_manual(values=acols, guide=legend.style, name=leg.title) +
    geom_vline(xintercept=0, color=as.character(acols["LOL"]), size=1.1) +
    scale_fill_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_linetype_manual(values=linestyle, guide=legend.style, name=leg.title) +
    scale_x_continuous(limits=ylims) +
    ylab("Likelihood") +
    xlab("") +
    ggtitle("") +
    theme_bw() +
    theme(legend.position="none",
        axis.text.y=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +

    coord_flip()
  top <- ggplot(data, aes(x=rstar.norm, y=..scaled.., color=alg, linetype=alg)) +
    geom_density(size=1.1) +
    scale_color_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_linetype_manual(values=linestyle, guide=legend.style, name=leg.title) +
    geom_vline(xintercept=0, color=as.character(acols["LOL"]), size=1.1) +
    scale_x_continuous(limits=xlims) +
    ylab("Likelihood") +
    xlab("") +
    ggtitle(plot.title) +
    theme_bw() + 
    theme(legend.position="none",
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
  return(arrangeGrob(top, leg, center, right, ncol=2, nrow=2, widths=c(4,1.5), heights=c(2,4)))
}
rn <- c("LOL", "PLS", "CCA","LDA", "PCA", "RP")
names(acols) <-rn
names(shapes) <- rn
names(linestyle) <- rn
results.exp.optimalr$alg <- revalue(results.exp.optimalr$alg, c("LRLDA"= "LDA"))
grid.arrange(make_marginal_2d(subset(results.exp.optimalr, !(alg %in% c(cmp.alg))), 
                              c(-1, 1), c(-.1, .1), plot.title="(A) Performance Across HDLSS Benchmarks", leg.title="Algorithm",
                              xl="Relative # Embedding Dimensions", yl="Relative Accuracy"))
plots.curves[[which(as.character(exp_names) == "mnist")]] <- plots.curves[[which(as.character(exp_names) == "mnist")]] +
  scale_x_continuous(breaks=c(1, 50, 100)) +
  scale_y_continuous(breaks=c(.15, .18, .21)) +
  coord_cartesian(xlim=c(1, 100), ylim=c(.15, .21))

plots.curves[[which(as.character(exp_names) == "MRN")]] <- plots.curves[[which(as.character(exp_names) == "MRN")]] +
  scale_x_continuous(breaks=c(1, 50, 100)) +
  scale_y_continuous(breaks=c(.14, .22, .3)) +
  coord_cartesian(xlim=c(1, 100), ylim=c(.14, .3))



dset.mnist <- plots.curves[[which(as.character(exp_names) == "mnist")]] + theme(text=element_text(size=10))
dset.mrn <- plots.curves[[which(as.character(exp_names) == "MRN")]] + theme(text=element_text(size=10))
scatter <- make_marginal_2d(subset(results.exp.optimalr, !(alg %in% c("LOL"))), 
                              c(-1, 1), c(-.1, .1), plot.title="(A) Performance Across HDLSS Benchmarks", leg.title="Algorithm",
                              xl="Normalized Embedding Dimension", yl="Normalized Misclassification Rate")

fig.5 <- grid.arrange(arrangeGrob(dset.mnist + ggtitle("(A) MNIST dataset, n=70000, d=784") + xlab("Embedding Dimension") + ylab("Misclassification Rate") + theme(text=element_text(size=9)),
                                  dset.mrn + ggtitle("(B) MRN dataset, n=114, d > 500,000,000") +  xlab("") + ylab("") + theme(text=element_text(size=9)),
                                  widths = c(0.5, 0.5), nrow=1),
                      nrow=2, heights=c(0.65, 0.35))

Aggregated over Datasets Plot

The average misclassification rate per-algorithm aggregated over all problems at $r_{a, j}^*$:

results.exp.optimalr.means <- aggregate(list(lhat.alg=results.exp.optimalr$lhat.alg, r.alg=results.exp.optimalr$r.alg),
                                        by=list(alg=results.exp.optimalr$alg), FUN=nan.mean)
print("Optimal")
for (j in 1:length(algs)) {
  print(sprintf("Mean Lhat for %s: %.3f", algs[j], results.exp.optimalr.means[results.exp.optimalr.means$alg == algs[j],]$lhat.alg))
  print(sprintf("Mean rstar for %s: %.3f", algs[j], results.exp.optimalr.means[results.exp.optimalr.means$alg == algs[j],]$r.alg))
}

ggplot(results.exp.optimalr.means, aes(x=r.alg, y=lhat.alg, shape=alg, fill=alg)) +
    geom_point(size=2) +
    scale_fill_manual(values=acols, guide=guide_legend(ncol=2, byrow=TRUE), name="Algorithm") +
    scale_shape_manual(values=shapes, guide=guide_legend(ncol=2, byrow=TRUE), name="Algorithm") +
    ylab("Misclassification Rate") +
    xlab("Embedding Dimension") +
    labs(shape="Simulation", color="Algorithm") +
    ggtitle("Real Data Average Results") +
    theme_bw()

Heatmaps

We use a one-sided Wilcoxon signed-rank test for the following two hypotheses, for algorithms $u$ and $v$ on each dataset $j$:

\begin{align} H_0^{r, u, v}: r^*_{u, j} \geq r^*_{v, j} \\ H_A^{r, u, v}: r^*_{u, j} < r^*_{v, j} \end{align}

the alternative being that the optimal number of embedding dimensions on a single dataset is lower for algorithm $u$ than for algorithm $v$; and:

\begin{align} H_0^{L, u, v}: \bar{L}_{u, j}\left(r^*_{u, j}\right) \geq \bar{L}_{v, j}\left(r^*_{v, j}\right) \\ H_A^{L, u, v}: \bar{L}_{u, j}\left(r^*_{u, j}\right) < \bar{L}_{v, j}\left(r^*_{v, j}\right) \end{align}

the alternative being that the misclassification rate at the optimal embedding dimension is lower for algorithm $u$ than for algorithm $v$.
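
A minimal sketch of the paired one-sided test used below, with toy per-dataset values (illustrative numbers only):

# toy optimal embedding dimensions for algorithms u and v across five datasets
r.u <- c(5, 8, 12, 7, 20); r.v <- c(9, 10, 15, 8, 26)
# one-sided paired Wilcoxon signed-rank test: is r.u systematically smaller than r.v?
wilcox.test(r.u, r.v, alternative="less", paired=TRUE)$p.value  # small p favors the alternative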

rhat.test <- data.frame(x=c(), y=c(), p=c())
lhat.test <- data.frame(x=c(), y=c(), p=c())
dlhat.test <- data.frame(x=c(), y=c(), p=c())
for (i in 1:length(algs)) {
  i.ss <- results.exp.optimalr[results.exp.optimalr$alg == algs[i],]
  i.ss.d <- results.exp.overall[results.exp.overall$alg == algs[i],]
  for (j in 1:length(algs)) {
    tryCatch({
      if (algs[i] == algs[j]) {
        rhat.test <- rbind(rhat.test, data.frame(x=algs[i], y=algs[j], p=NaN))
        lhat.test <- rbind(lhat.test, data.frame(x=algs[i], y=algs[j], p=NaN))
        dlhat.test <- rbind(dlhat.test, data.frame(x=algs[i], y=algs[j], p=NaN))
      } else {
        j.ss <- results.exp.optimalr[results.exp.optimalr$alg == algs[j],]
        j.ss.d <- results.exp.overall[results.exp.overall$alg == algs[j],]
        cmp <- merge(i.ss, j.ss, by=c("exp"), all=TRUE)
        cmp.d <- merge(i.ss.d, j.ss.d, by=c("exp", "d", "n", "K", "ntrain", "repo", "r.max",
                                            "r.alg", "lhat.chance", "lhat.lol"), all=FALSE)
        rhat.test <- rbind(rhat.test, data.frame(x=algs[j], y=algs[i], p=wilcox.test(cmp$r.alg.x, cmp$r.alg.y,
                                                                                     alternative = "less", paired=TRUE)$p.value))
        lhat.test <- rbind(lhat.test, data.frame(x=algs[j], y=algs[i], p=wilcox.test(cmp$lhat.alg.x/cmp$lhat.chance.x, cmp$lhat.alg.y/cmp$lhat.chance.y,
                                                                                     alternative = "less", paired=TRUE)$p.value))
        k.cmp <- c()
        for (exp in exp_names) {
          cmp.d.exp <- cmp.d[cmp.d$exp == exp,]
          k.exp <- mean(cmp.d.exp$lhat.alg.x < cmp.d.exp$lhat.alg.y)
          if (!is.nan(k.exp)) {
            k.cmp <- c(k.cmp, k.exp)
          }
        }
        tstat.alt <- sum(k.cmp)
      }
    }, error=function(e){NaN})
  }
}

lhat.test$x <- factor(lhat.test$x, levels = algs); lhat.test$y <- factor(lhat.test$y, levels = algs)
rhat.test$x <- factor(rhat.test$x, levels = algs); rhat.test$y <- factor(rhat.test$y, levels = algs)
rhat.test$p[rhat.test$p < .001] = .001
lhat.test$p[lhat.test$p < .001] = .001
lhat.hmap <- ggplot(lhat.test, aes(x=x, y=y, fill=p)) +
  geom_tile() +
  geom_text(aes(label = round(p, 3))) +
  scale_fill_gradientn(name=TeX("$p$-value"), trans="log", breaks=c(0.001, 0.01, 0.1, 1),
                       colours=rev(c("#f2f0f7", "#cbc9e2", "#9e9ac8", "#6a51a3")),
                       limits=c(0.001, 1)) +
  ggtitle(TeX("(B) Test of whether $\\hat{L}_{v, j} < $\\hat{L}_{u, j}$")) +
  xlab("Algorithm u") +
  ylab("Algorithm v")

rhat.hmap <- ggplot(rhat.test, aes(x=x, y=y, fill=p)) +
  geom_tile() +
  geom_text(aes(label = round(p, 3))) +
  scale_fill_gradientn(name=TeX("$p$-value"), trans="log", breaks=c(0.001, 0.01, 0.1, 1),
                       colours=rev(c("#f2f0f7", "#cbc9e2", "#9e9ac8", "#6a51a3")),
                       limits=c(0.001, 1)) +
  ggtitle(TeX("(C) Test of whether $r_{v, j} < r_{u, j}$")) +
  xlab("Algorithm u") +
  ylab("Algorithm v")

print(lhat.hmap)
print(rhat.hmap)

Per-Dataset

The normalized misclassification rate per-dataset in a beeswarm plot:

legend.style=guide_legend(ncol=2, byrow=TRUE)
ggplot(data=subset(results.exp.optimalr, alg != "LOL"), aes(x=alg, y=lhat.norm, color=alg, group=alg)) +
  geom_beeswarm(alpha=0.5, size=2) +
  xlab("Algorithm") +
  scale_y_continuous(limits=c(-0.2, .15)) +
  scale_color_manual(values=acols, guide=legend.style, name="Algorithm") +
  ylab(TeX("$Mean\\left(\\frac{\\hat{L}_{LOL} - \\hat{L}_{Alg}}{\\hat{L}_{chance}}\\right)$")) +
  theme_bw()
# quadrant fractions for PLS, using the per-dataset normalized results
# (rstar.norm, lhat.norm) from results.exp.optimalr
pls.norm <- subset(results.exp.optimalr, alg == "PLS")
rs <- pls.norm$rstar.norm
lhats <- pls.norm$lhat.norm
print(sprintf("Lower-Left (r <= 0 & lhat <= 0): %.3f", sum(rs <= 0 & lhats <= 0)/length(rs)))
print(sprintf("Lower-Right (r >= 0 & lhat <= 0): %.3f", sum(rs >= 0 & lhats <= 0)/length(rs)))
print(sprintf("Top-Left (r <= 0 & lhat >= 0): %.3f", sum(rs <= 0 & lhats >= 0)/length(rs)))
print(sprintf("Top-Right (r >= 0 & lhat >= 0): %.3f", sum(rs >= 0 & lhats >= 0)/length(rs)))
print(sprintf("Center (r == 0 & lhat == 0): %.3f", sum(rs == 0 & lhats == 0)/length(rs)))

And the difference between the normalized misclassification rate of LOL and that of PLS, as a function of several different variables:

pls.ss <- subset(results.exp.optimalr, alg == "PLS")
pls.ss$lhat.mean <- apply(pls.ss[, c("lhat.alg", "lhat.lol")], 1, function(x) {mean(c(x[1], x[2]), na.rm=TRUE)})  # c() is needed to average the pair; mean(x[1], x[2]) would pass x[2] as the trim argument
chancep <- ggplot(pls.ss, aes(x=lhat.chance, y=lhat.norm)) +
  geom_point(size=1.5) +
  geom_smooth(method = "loess", size = 1.5) +
  xlab(TeX("$\\hat{L}_{chance}$")) +
  ylab("")

pp <- ggplot(pls.ss, aes(x=d, y=lhat.norm)) +
  geom_point(size=1.5) +
  geom_smooth(method = "loess", size = 1.5) +
  xlab(TeX("p")) +
  ylab("") +
  scale_x_continuous(trans=log10_trans())

np <- ggplot(pls.ss, aes(x=n, y=lhat.norm)) +
  geom_point(size=1.5) +
  geom_smooth(method = "loess", size = 1.5) +
  xlab(TeX("n")) +
  ylab("") +
  scale_x_continuous(trans=log10_trans())

npp <- ggplot(pls.ss, aes(x=n/d, y=lhat.norm)) +
  geom_point(size=1.5) +
  geom_smooth(method = "loess", size = 1.5) +
  xlab(TeX("n/p")) +
  ylab("") +
  scale_x_continuous(trans=log10_trans())


kp <- ggplot(pls.ss, aes(x=K, y=lhat.norm)) +
  geom_point(size=1.5) +
  geom_smooth(method = "loess", size = 1.5) +
  ylab("") +
  xlab(TeX("K"))

mp <- ggplot(pls.ss, aes(x=lhat.mean, y=lhat.norm)) +
  geom_point(size=1.5) +
  geom_smooth(method = "loess", size = 1.5) +
  ylab("") +
  xlab(TeX("$Mean\\left(\\hat{L}_{LOL}, \\hat{L}_{PLS}\\right)$"))

pls.ss$alpha <- ifelse(pls.ss$lhat.norm < 0, 1, 0)
pls.ss$alpha <- factor(pls.ss$alpha, levels=c(1, 0))
acols <- c("#00ff00", "#ff0000")
names(acols) <- c(1, 0)
labs.alpha <- lapply(c("$\\hat{L}_{LOL} < \\hat{L}_{PLS}$", "$\\hat{L}_{LOL} > \\hat{L}_{PLS}$"), TeX)

np_diffp <- ggplot(pls.ss, aes(x=n, y=d, color=alpha)) +
  geom_point(size=1.5) +
  scale_x_continuous(trans=log10_trans()) +
  scale_color_manual(values=acols, labels=labs.alpha,
                     name=TeX("Normalized $Mean\\left(\\frac{\\hat{L}_{LOL} - \\hat{L}_{PLS}}{\\hat{L}_{chance}}\\right)$"))

layt.mtx <- rbind(c(1, 2, 3, 7, 7, 7),
                  c(4, 5, 6, 7, 7, 7))
grid.arrange(arrangeGrob(chancep, pp, np, npp, kp, mp, np_diffp,
             left=TeX("$Mean\\left(\\frac{\\hat{L}_{LOL} - \\hat{L}_{PLS}}{\\hat{L}_{chance}}\\right)$"),
             layout_matrix=layt.mtx))

