In this notebook, we compare the performance of LOL to that of several other linear embedding techniques on the problems from the UCI and PMLB repositories that qualify as high-dimensionality, low-sample-size (HDLSS), meaning $d > 100$. The data below were collected with $k=50$-fold cross-validation: the testing set was rotated across all folds, with the training set comprising the remaining $k-1=49$ folds. Because only a handful of problems are natively HDLSS ($n < d$), we subsample the training set to $\textrm{min}\left(n\frac{k-1}{k}, d-1\right)$ samples, where $n$ is the number of provided samples, $\frac{k-1}{k} = \frac{49}{50}$ is the fraction of samples used for training, and $d$ is the native dimensionality of the data. This ensures that every example shown below is evaluated in the HDLSS regime. The datasets were run using the real-data driver script.
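For concreteness, the subsampling rule works out as in the sketch below (the values of `n`, `d`, and `k` here are hypothetical, not taken from any particular benchmark):

```r
# hypothetical dataset: n samples in d dimensions, k-fold cross-validation
n <- 2000; d <- 300; k <- 50
# per-fold training size before subsampling
ntrain.full <- n * (k - 1) / k      # 1960
# cap the training set at d - 1 samples so that every fold is HDLSS
ntrain <- min(ntrain.full, d - 1)
ntrain                              # 299 < d, so training is guaranteed HDLSS
```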
```r
knitr::opts_chunk$set(warning = FALSE)
```
```r
require(lolR)
require(ggplot2)
require(latex2exp)
require(MASS)
require(gridExtra)
require(data.table)
require(reshape2)
require(R.matlab)
require(grid)
require(plyr)
require(slb)
require(scales)
require(stringr)
require(ggbeeswarm)

classifier.name <- "lda"
opath <- './data/real_data'
repo.name <- "uci"
cmp.alg <- "LOL"
cmp.algname <- "lol"

algs <- c("LOL", "PLS", "CCA", "LRLDA", "PCA", "RP")
acols <- c("#f41711", "#94d6c9", "#87c6cc", "#99b4c6", "#020202", "#5f8793")
linestyle <- c("solid", "dotted", "solid", "dotted", "solid", "solid")
names(linestyle) <- algs
names(algs) <- acols
names(acols) <- algs
shapes <- c(21, 24, 21, 22, 21, 23)
names(shapes) <- algs

# compute the cutoff for the particular trial to get an approximate elbow
# by computing the smallest r with an associated lhat within a fraction t
# (e.g. 5%) of the global minimum lhat
compute_cutoff <- function(rs, lhats, t=0) {
  # filter both vectors jointly before subsetting either
  keep <- complete.cases(lhats) & complete.cases(rs)
  rs <- rs[keep]; lhats <- lhats[keep]
  # compute minimum value
  min.lhat <- min(lhats)
  # compute minimum value + t
  lhat.thresh <- (1 + t)*min.lhat
  # find which indices are all below this
  lhat.below <- which(lhats <= lhat.thresh)
  rs.below <- rs[lhat.below]; lhats.below <- lhats[lhat.below]
  # smallest r below the threshold
  tmin.ix <- which.min(rs.below)
  return(list(r=rs.below[tmin.ix], lhat=lhats.below[tmin.ix]))
}

w <- .8
h <- .2

plot_sim_lhats <- function(data_sub, data_sub.optimalr, cols, linetype, shape, title="",
                           from=10, ylab=TeX("$\\hat{L}$"), xlab="Embedded Dimensions",
                           fsize=12, length.out=3) {
  lims <- c(floor(100*min(data_sub.optimalr$lhat.alg, na.rm=TRUE))/100,
            ceiling(100*max(data_sub.optimalr$lhat.alg, na.rm=TRUE))/100)
  if (is.na(sum(lims))) {
    return(ggplot())
  }
  tryCatch({
    breaks <- unique(round(seq(from=lims[1], to=lims[2], length.out=length.out), digits=1))
    xlims <- c(min(data_sub$r.alg, na.rm=TRUE), max(data_sub$r.alg, na.rm=TRUE))
    xbreaks <- round(seq(from=from, to=xlims[2], length.out=length.out))
    plot_sims <- ggplot(data_sub, aes(x=r.alg, y=lhat.alg, linetype=alg, shape=alg,
                                      fill=alg, color=alg)) +
      stat_summary(size=.95, fun.y="mean", geom="line") +
      geom_point(data=data_sub.optimalr,
                 aes(x=r.alg, y=lhat.alg, shape=alg, fill=alg, color=alg), size=2) +
      scale_color_manual(values=cols, limits=names(cols),
                         guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      scale_fill_manual(values=cols, limits=names(cols),
                        guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      scale_shape_manual(values=shape, limits=names(cols),
                         guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      scale_linetype_manual(values=linetype, limits=names(cols),
                            guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      xlab(xlab) + ylab(ylab) + ggtitle(title) +
      theme_bw() +
      coord_cartesian(ylim=lims) +
      scale_x_continuous(limits=xlims, breaks=xbreaks) +
      theme(plot.margin = unit(c(h, w, h, h), "cm")) +
      theme(legend.position="bottom", text=element_text(size=fsize))
    return(plot_sims)
  }, error=function(e) {return(ggplot())})
}

plot_sim_scatter <- function(data.sub.foldwise, cols, shape, title="",
                             ylab=TeX("$\\frac{\\hat{L}_{LOL} - \\hat{L}_{alg}}{\\hat{L}_{chance}}$"),
                             psize=1, xlab="Algorithm", fsize=12) {
  tryCatch({
    plot_sims <- ggplot(data=subset(data.sub.foldwise, alg != "LOL"),
                        aes(x=alg, y=lhat.norm, color=alg, group=alg)) +
      geom_beeswarm(alpha=0.5, size=psize) +
      scale_color_manual(values=cols, limits=names(cols),
                         guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      scale_fill_manual(values=cols, limits=names(cols),
                        guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      scale_shape_manual(values=shape, limits=names(cols),
                         guide=guide_legend(nrow=2, byrow=TRUE), name="Algorithm") +
      xlab(xlab) + ylab(ylab) + ggtitle(title) +
      theme_bw() +
      theme(plot.margin = unit(c(h, w, h, h), "cm")) +
      theme(legend.position="bottom", text=element_text(size=fsize))
    return(plot_sims)
  }, error=function(e) {return(ggplot())})
}

# extract the legend grob from a ggplot
g_legend <- function(a.gplot) {
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}

# reshape the MRN connectome results into the same long format as the other benchmarks
parse_mrn <- function() {
  data <- list(LOL=lol.mrn, PCA=pca.mrn, LRLDA=lrlda.mrn)
  result <- data.frame()
  for (alg in names(data)) {
    colnames(data[[alg]]) <- c(1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
    for (fold in 1:dim(data[[alg]])[1]) {
      result <- rbind(result, data.frame(exp="MRN", alg=alg, xv=40, n=114, ntrain=111,
                                         d=500000000, K=2, fold=fold,
                                         r=colnames(data[[alg]]),
                                         lhat=as.numeric(data[[alg]][fold,]),
                                         repo="neurodata"))
    }
  }
  result <- rbind(result, data.frame(exp="MRN", alg="RandomGuess", xv=40, n=114, ntrain=111,
                                     d=500000000, K=2, fold=1:dim(pca.mrn)[1],
                                     r=NaN, lhat=0.5, repo="neurodata"))
  return(result)
}

lol.mrn <- read.csv('./data/real_data/LOL-MRN-40.csv')
pca.mrn <- read.csv('./data/real_data/PCA-LDA-MRN-40.csv')
lrlda.mrn <- read.csv('./data/real_data/RR-LDA-MRN-40.csv')
results.mrn <- parse_mrn()
```
```r
results <- readRDS(file.path(opath, paste(classifier.name, "_results.rds", sep="")))
results <- rbind(results, results.mrn)
exp_names <- unique(as.character(results$exp))

# filter out bad rows
results <- results[complete.cases(results$lhat) & !(is.infinite(results$lhat)) &
                     complete.cases(results[colnames(results) != "r"]),]
exp_names <- unique(as.character(results$exp))

# make sure columns of interest are numeric
numeric.cols <- c("d", "n", "ntrain", "K", "fold", "lhat", "r")
for (col in numeric.cols) {
  results[[col]] <- as.numeric(results[[col]])
  if (col == "ntrain") {
    # make sure that ntrain is constant across experiments, due to the situation
    # where one fold has fewer examples than another
    for (exp in exp_names) {
      results[results$exp == exp,]$ntrain <- median(results[results$exp == exp,]$ntrain, na.rm=TRUE)
    }
  }
}

# subset out datasets with low d and incredibly low sample-size:class ratio
results <- results[results$d > 100 & results$ntrain/results$K > 10,]
# filter out duplicate experiments
results <- results[!(results$exp %in% c("splice", "promoters", "molecular-biology_promoters", "rorb")),]

nan.mean <- function(x) {mean(x[!is.infinite(x)], na.rm=TRUE)}
results.means <- aggregate(lhat ~ exp + alg + r + d + n + ntrain + K, data=results, FUN=nan.mean)
random.results <- aggregate(lhat ~ exp + alg, data=subset(results, alg == "RandomGuess"), FUN=mean)
```
```r
lhat.mean <- 1
k.fold <- length(unique(results$fold))

results.optimalr <- data.frame()
results.overall <- data.frame()
for (i in 1:length(exp_names)) {
  r.max <- max(results.means[results.means$exp == as.character(exp_names[i]),]$r)
  ss.chance <- results[results$alg == "RandomGuess" & results$exp == as.character(exp_names[i]),]
  colnames(ss.chance)[colnames(ss.chance) == "lhat"] <- "lhat.chance"
  ss.chance <- ss.chance[, colnames(ss.chance) != "r"]
  for (j in 1:length(algs)) {
    tryCatch({
      alg <- as.character(algs[j])
      ss <- results[results$exp == exp_names[i] & results$alg == algs[j] & complete.cases(results),]
      ss.means <- aggregate(lhat ~ r + n + d, data=ss, FUN=nan.mean)
      rs <- ss.means$r; lhats <- ss.means$lhat
      # find the optimal embedding dimension for this (experiment, algorithm) pair
      min.result <- compute_cutoff(rs, lhats)
      r.star <- min.result$r
      ss.optimalr <- results[results$exp == exp_names[i] & results$alg == algs[j] & results$r == r.star,]
      if (alg == cmp.alg) {
        r.lol <- r.star
        lol.ss <- ss
        colnames(lol.ss)[colnames(lol.ss) == "lhat"] <- sprintf("lhat.%s", cmp.algname)
        lol.optimalr <- ss.optimalr
        colnames(lol.optimalr)[colnames(lol.optimalr) == "lhat"] <- sprintf("lhat.%s", cmp.algname)
        colnames(lol.optimalr)[colnames(lol.optimalr) == "rstar.norm"] <- sprintf("rstar.norm.%s", cmp.algname)
      }
      colnames(ss)[colnames(ss) == "lhat"] <- "lhat.alg"
      colnames(ss.optimalr)[colnames(ss.optimalr) == "lhat"] <- "lhat.alg"
      ss.merged <- merge(ss, lol.ss, by=c("exp", "fold", "n", "d", "K", "ntrain", "repo", "xv"))
      ss.merged <- merge(ss.merged, ss.chance,
                         by=c("exp", "fold", "n", "d", "K", "ntrain", "repo", "xv"), all=TRUE)
      ss.optimalr <- merge(ss.optimalr, lol.optimalr,
                           by=c("exp", "fold", "n", "d", "K", "ntrain", "repo", "xv"))
      ss.optimalr <- merge(ss.optimalr, ss.chance,
                           by=c("exp", "fold", "n", "d", "K", "ntrain", "repo", "xv"), all=TRUE)
      colnames(ss.merged)[colnames(ss.merged) == "r.x"] <- "r.alg"
      colnames(ss.optimalr)[colnames(ss.optimalr) == "r.x"] <- "r.alg"
      colnames(ss.merged)[colnames(ss.merged) == "r.y"] <- sprintf("r.%s", cmp.algname)
      colnames(ss.optimalr)[colnames(ss.optimalr) == "r.y"] <- sprintf("r.%s", cmp.algname)
      colnames(ss.merged)[colnames(ss.merged) == "alg"] <- "chance"
      colnames(ss.merged)[colnames(ss.merged) == "alg.x"] <- "alg"
      colnames(ss.optimalr)[colnames(ss.optimalr) == "alg"] <- "chance"
      colnames(ss.optimalr)[colnames(ss.optimalr) == "alg.x"] <- "alg"
      # misclassification rate relative to LOL, normalized by the chance rate
      ss.merged$lhat.norm <- (ss.merged[[sprintf("lhat.%s", cmp.algname)]] - ss.merged$lhat.alg)/ss.merged$lhat.chance
      ss.merged$r.max <- r.max
      ss.optimalr$lhat.norm <- (ss.optimalr[[sprintf("lhat.%s", cmp.algname)]] - ss.optimalr$lhat.alg)/ss.optimalr$lhat.chance
      # optimal embedding dimension relative to LOL, normalized by the largest r swept
      ss.optimalr$rstar.norm <- (ss.optimalr[[sprintf("r.%s", cmp.algname)]] - ss.optimalr$r.alg)/r.max
      ss.optimalr$r.max <- r.max
      results.overall <- rbind(results.overall, ss.merged)
      results.optimalr <- rbind(results.optimalr, ss.optimalr)
    }, error=function(e) {print(sprintf("exp: %d, alg: %d", i, j))})
  }
}
results.overall <- results.overall[complete.cases(results.overall),]
results.optimalr <- results.optimalr[complete.cases(results.optimalr),]

results.exp.overall <- aggregate(list(lhat.norm=results.overall$lhat.norm,
                                      lhat.alg=results.overall$lhat.alg,
                                      lhat.lol=results.overall[[sprintf("lhat.%s", cmp.algname)]],
                                      lhat.chance=results.overall$lhat.chance,
                                      r.max=results.overall$r.max),
                                 by=list(exp=results.overall$exp, n=results.overall$n,
                                         K=results.overall$K, ntrain=results.overall$ntrain,
                                         d=results.overall$d, repo=results.overall$repo,
                                         r.alg=results.overall$r.alg, alg=results.overall$alg),
                                 FUN=nan.mean)
results.exp.optimalr <- aggregate(list(lhat.norm=results.optimalr$lhat.norm,
                                       lhat.alg=results.optimalr$lhat.alg,
                                       lhat.lol=results.optimalr[[sprintf("lhat.%s", cmp.algname)]],
                                       lhat.chance=results.optimalr$lhat.chance,
                                       rstar.norm=results.optimalr$rstar.norm,
                                       r.max=results.optimalr$r.max),
                                  by=list(exp=results.optimalr$exp, n=results.optimalr$n,
                                          K=results.optimalr$K, ntrain=results.optimalr$ntrain,
                                          d=results.optimalr$d, repo=results.optimalr$repo,
                                          r.alg=results.optimalr$r.alg, alg=results.optimalr$alg),
                                  FUN=nan.mean)
```
Given algorithm $a$, let $L_{a, j, i}$ be the $i^{th}$ fold's misclassification rate for dataset $j$ of the $D$ datasets, with $r \in [1, ..., p]$ embedding dimensions:
\begin{align}
  \bar{L}_{a, j}(r) = \textrm{mean}_{i \in [1, ..., k]}\left\{L_{a, j, i}(r)\right\}
\end{align}
A single dot is produced where:
\begin{align}
  r^*_{a, j} = \textrm{argmin}_r\left\{\bar{L}_{a, j}(r)\right\}
\end{align}
which is defined as the optimum number of embedding dimensions.
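The `compute_cutoff` helper defined above implements this selection, returning the smallest $r$ whose mean misclassification rate is within a tolerance $t$ of the global minimum; with $t = 0$ this is the argmin itself. A toy example with made-up values:

```r
# made-up mean misclassification curve over embedding dimensions r
rs <- c(1, 5, 10, 20, 50, 100)
lhats <- c(0.45, 0.30, 0.21, 0.20, 0.20, 0.22)
# t=0: the exact argmin; t=0.05: the smallest r within 5% of the minimum
compute_cutoff(rs, lhats, t=0)     # r=20, lhat=0.20
compute_cutoff(rs, lhats, t=0.05)  # r=10, lhat=0.21
```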
```r
plots.curves <- list()
plots.scatters <- list()
for (i in 1:length(exp_names)) {
  exp <- exp_names[i]
  data_sub <- results.overall[results.overall$exp == exp,]
  data_sub.optimalr <- results.exp.optimalr[results.exp.optimalr$exp == exp,]
  data_sub.optimalr.foldwise <- results.optimalr[results.optimalr$exp == exp,]
  tryCatch({
    plots.curves[[i]] <- plot_sim_lhats(data_sub, data_sub.optimalr, acols, linestyle, shapes,
                                        ylab="",
                                        title=sprintf("Exp %d, K=%d, n=%d, p=%d", i, data_sub[1,]$K,
                                                      data_sub[1,]$ntrain, data_sub[1,]$d),
                                        from=1, fsize=7)
    plots.scatters[[i]] <- plot_sim_scatter(data_sub.optimalr.foldwise, acols, shapes,
                                            title=sprintf("Exp %d, K=%d, n=%d, p=%d", i, data_sub[1,]$K,
                                                          data_sub[1,]$ntrain, data_sub[1,]$d),
                                            fsize=12)
  }, error=function(e) {NaN})
}
curves.plot_leg <- g_legend(plots.curves[[1]])
scatters.plot_leg <- g_legend(plots.scatters[[1]])
plots.curves <- lapply(plots.curves, function(plot) plot + theme(legend.position=NaN) + xlab("") + ylab(""))
plots.curves[[1]] <- plots.curves[[1]] + xlab("Embedded Dimensions") + ylab("Misclassification Rate")
plots.scatters <- lapply(plots.scatters, function(plot) plot + theme(legend.position=NaN) + xlab("") + ylab(""))
plots.scatters[[1]] <- plots.scatters[[1]] + xlab("Algorithm") + ylab(TeX("Normalized Misclassification Rate"))
grid.arrange(arrangeGrob(grobs=plots.curves, nrow=floor(sqrt(length(plots.curves)))),
             curves.plot_leg, nrow=2, heights=c(0.8, .15))
grid.arrange(arrangeGrob(grobs=plots.scatters, nrow=floor(sqrt(length(plots.scatters)))),
             scatters.plot_leg, nrow=2, heights=c(0.8, .15))
```
For algorithm $a$ and problem $j$, the relative embedding dimension is:
\begin{align}
  \left|\left|r^*_{a, j}\right|\right| = \frac{r^*_{LOL, j} - r^*_{a, j}}{p}
\end{align}
and the relative misclassification rate per dataset is:
\begin{align}
  \left|\left|\bar{L}_{a, j}(r)\right|\right| = \textrm{mean}_{i \in [1, ..., k]}\left\{\frac{\bar{L}_{a, j, i}\left(r^*_{LOL, j}\right) - \bar{L}_{a, j, i}\left(r^*_{a, j}\right)}{\bar{L}_{chance, j, i}}\right\}
\end{align}
where the $chance$ classifier is simply the classifier that guesses the "most-present" class (the class with the highest prior) in the particular fold.
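As a minimal sketch of these two normalizations (all rates and dimensions below are made up for illustration):

```r
# made-up per-fold misclassification rates at each algorithm's optimal r
lhat.lol <- c(0.10, 0.12, 0.11)     # LOL, folds 1..3
lhat.alg <- c(0.14, 0.15, 0.13)     # a competitor, folds 1..3
lhat.chance <- c(0.50, 0.50, 0.50)  # chance (highest-prior class) per fold
# relative misclassification rate: negative values favor LOL
mean((lhat.lol - lhat.alg) / lhat.chance)   # -0.06
# relative embedding dimension: negative values mean LOL embeds lower
r.lol <- 15; r.alg <- 40; p <- 100
(r.lol - r.alg) / p                         # -0.25
```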
```r
make_marginal_2d <- function(data, xlims, ylims, plot.title="", xl="", yl="", leg.title="",
                             legend.style=guide_legend(ncol=2, byrow=TRUE)) {
  data$exp <- factor(data$exp)
  box <- data.frame(x=c(min(xlims), mean(xlims), mean(xlims), min(xlims)),
                    y=c(min(ylims), min(ylims), mean(ylims), mean(ylims)))
  box2 <- data.frame(x=c(max(xlims), mean(xlims), mean(xlims), max(xlims)),
                     y=c(max(ylims), max(ylims), mean(ylims), mean(ylims)))
  data.medians <- aggregate(list(lhat.norm=data$lhat.norm, rstar.norm=data$rstar.norm),
                            by=list(alg=data$alg), FUN=median)
  data.medians <- rbind(data.medians, data.frame(lhat.norm=0, rstar.norm=0, alg="LOL"))  # add dot for LOL

  # tabulate the fraction of results falling in each quadrant
  tab <- data.frame(alg=c(), q1=c(), q2=c(), q3=c(), q4=c())
  for (alg in levels(data$alg)) {
    ss <- data[data$alg == alg,]
    tab <- rbind(tab, data.frame(alg=alg,
                                 q1=sum(ss$rstar.norm >= 0 & ss$lhat.norm >= 0),
                                 q2=sum(ss$rstar.norm <= 0 & ss$lhat.norm >= 0),
                                 q3=sum(ss$rstar.norm <= 0 & ss$lhat.norm <= 0),
                                 q4=sum(ss$rstar.norm >= 0 & ss$lhat.norm <= 0)))
  }
  tab <- rbind(tab, data.frame(alg="overall",
                               q1=sum(data$rstar.norm >= 0 & data$lhat.norm >= 0),
                               q2=sum(data$rstar.norm <= 0 & data$lhat.norm >= 0),
                               q3=sum(data$rstar.norm <= 0 & data$lhat.norm <= 0),
                               q4=sum(data$rstar.norm >= 0 & data$lhat.norm <= 0)))
  tab[,2:dim(tab)[2]] <- tab[,2:dim(tab)[2]]/apply(tab[,2:dim(tab)[2]], c(1), sum)
  print(tab)

  center <- ggplot(data, aes(x=rstar.norm, y=lhat.norm)) +
    geom_polygon(data=box, aes(x=x, y=y), fill='green', alpha=0.15) +
    geom_polygon(data=box2, aes(x=x, y=y), fill='red', alpha=0.15) +
    geom_point(aes(x=rstar.norm, y=lhat.norm, shape=alg, color=alg, fill=alg), alpha=0.5, size=1.2) +
    geom_point(data=data.medians, aes(x=rstar.norm, y=lhat.norm, shape=alg, color=alg, fill=alg),
               alpha=1.0, size=2.5) +
    scale_fill_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_color_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_shape_manual(values=shapes, guide=legend.style, name=leg.title) +
    ylab(yl) + xlab(xl) + labs(shape="Simulation", color="Algorithm") + ggtitle("") +
    scale_y_continuous(limits=ylims) + scale_x_continuous(limits=xlims) + theme_bw() +
    annotate("text", size=4, label="LOL Better", x=-.5, y=.1, color="green4") +
    annotate("text", size=4, label="LOL Better", x=1.0, y=-.05, angle=-90, color="green4") +
    annotate("text", size=4, label="LOL Worse", x=.5, y=.1, color="red") +
    annotate("text", size=4, label="LOL Worse", x=1.0, y=.05, angle=-90, color="red")

  center_leg <- ggplot(data, aes(x=rstar.norm, y=lhat.norm)) +
    geom_polygon(data=box, aes(x=x, y=y), fill='red', alpha=0.15) +
    geom_line(aes(x=rstar.norm, y=lhat.norm, linetype=alg, color=alg), alpha=1.0, size=1.0) +
    geom_point(aes(x=rstar.norm, y=lhat.norm, shape=alg, color=alg, fill=alg), alpha=0.4, size=1.2) +
    geom_point(data=data.medians, aes(x=rstar.norm, y=lhat.norm, shape=alg, color=alg, fill=alg),
               alpha=1.0, size=2.5) +
    scale_fill_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_color_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_shape_manual(values=shapes, guide=legend.style, name=leg.title) +
    scale_linetype_manual(values=linestyle, guide=legend.style, name=leg.title) +
    ylab(yl) + xlab(xl) + labs(shape="Simulation", color="Algorithm") + ggtitle("") +
    scale_y_continuous(limits=ylims) + scale_x_continuous(limits=xlims) + theme_bw() +
    guides(linetype=FALSE)
  leg <- g_legend(center_leg)
  center <- center + theme(legend.position=NaN)

  right <- ggplot(data, aes(x=lhat.norm, y=..scaled.., color=alg, linetype=alg)) +
    geom_density(size=1.1) +
    scale_color_manual(values=acols, guide=legend.style, name=leg.title) +
    geom_vline(xintercept=0, ymin=0, ymax=1, color=as.character(acols["LOL"]), size=1.1) +
    scale_fill_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_linetype_manual(values=linestyle, guide=legend.style, name=leg.title) +
    scale_x_continuous(limits=ylims) +
    ylab("Likelihood") + xlab("") + ggtitle("") + theme_bw() +
    theme(legend.position=NaN, axis.text.y=element_blank(),
          axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
    coord_flip()

  top <- ggplot(data, aes(x=rstar.norm, y=..scaled.., color=alg, linetype=alg)) +
    geom_density(size=1.1) +
    scale_color_manual(values=acols, guide=legend.style, name=leg.title) +
    scale_linetype_manual(values=linestyle, guide=legend.style, name=leg.title) +
    geom_vline(xintercept=0, ymin=0, ymax=1, color=as.character(acols["LOL"]), size=1.1) +
    scale_x_continuous(limits=xlims) +
    ylab("Likelihood") + xlab("") + ggtitle(plot.title) + theme_bw() +
    theme(legend.position=NaN, axis.text.y=element_blank(), axis.ticks.y=element_blank(),
          axis.text.x=element_blank(), axis.ticks.x=element_blank())

  return(arrangeGrob(top, leg, center + theme(legend.position=NaN), right,
                     ncol=2, nrow=2, widths=c(4, 1.5), heights=c(2, 4)))
}

rn <- c("LOL", "PLS", "CCA", "LDA", "PCA", "RP")
names(acols) <- rn
names(shapes) <- rn
names(linestyle) <- rn
results.exp.optimalr$alg <- revalue(results.exp.optimalr$alg, c("LRLDA"="LDA"))
grid.arrange(make_marginal_2d(subset(results.exp.optimalr, !(alg %in% c(cmp.alg))),
                              c(-1, 1), c(-.1, .1),
                              plot.title="(A) Performance Across HDLSS Benchmarks",
                              leg.title="Algorithm",
                              xl="Relative # Embedding Dimensions", yl="Relative Accuracy"))
```
```r
plots.curves[[which(as.character(exp_names) == "mnist")]] <- plots.curves[[which(as.character(exp_names) == "mnist")]] +
  scale_x_continuous(breaks=c(1, 50, 100)) + scale_y_continuous(breaks=c(.15, .18, .21)) +
  coord_cartesian(xlim=c(1, 100), ylim=c(.15, .21))
plots.curves[[which(as.character(exp_names) == "MRN")]] <- plots.curves[[which(as.character(exp_names) == "MRN")]] +
  scale_x_continuous(breaks=c(1, 50, 100)) + scale_y_continuous(breaks=c(.14, .22, .3)) +
  coord_cartesian(xlim=c(1, 100), ylim=c(.14, .3))
dset.mnist <- plots.curves[[which(as.character(exp_names) == "mnist")]] + theme(text=element_text(size=10))
dset.mrn <- plots.curves[[which(as.character(exp_names) == "MRN")]] + theme(text=element_text(size=10))
scatter <- make_marginal_2d(subset(results.exp.optimalr, !(alg %in% c("LOL"))),
                            c(-1, 1), c(-.1, .1),
                            plot.title="(A) Performance Across HDLSS Benchmarks",
                            leg.title="Algorithm",
                            xl="Normalized Embedding Dimension", yl="Normalized Misclassification Rate")
fig.5 <- grid.arrange(arrangeGrob(dset.mnist + ggtitle("(A) MNIST dataset, n=70000, d=784") +
                                    xlab("Embedding Dimension") + ylab("Misclassification Rate") +
                                    theme(text=element_text(size=9)),
                                  dset.mrn + ggtitle("(B) MRN dataset, n=114, d > 500,000,000") +
                                    xlab("") + ylab("") + theme(text=element_text(size=9)),
                                  widths=c(0.5, 0.5), nrow=1),
                      nrow=2, heights=c(0.65, 0.35))
```
The average misclassification rate per algorithm, aggregated over all problems at $r^*_{a, j}$:
```r
results.exp.optimalr.means <- aggregate(list(lhat.alg=results.exp.optimalr$lhat.alg,
                                             r.alg=results.exp.optimalr$r.alg),
                                        by=list(alg=results.exp.optimalr$alg), FUN=nan.mean)
print("Optimal")
for (j in 1:length(algs)) {
  print(sprintf("Mean Lhat for %s: %.3f", algs[j],
                results.exp.optimalr.means[results.exp.optimalr.means$alg == algs[j],]$lhat.alg))
  print(sprintf("Mean rstar for %s: %.3f", algs[j],
                results.exp.optimalr.means[results.exp.optimalr.means$alg == algs[j],]$r.alg))
}
ggplot(results.exp.optimalr.means, aes(x=r.alg, y=lhat.alg, shape=alg, fill=alg)) +
  geom_point(size=2) +
  scale_fill_manual(values=acols, guide=guide_legend(ncol=2, byrow=TRUE), name="Algorithm") +
  scale_shape_manual(values=shapes, guide=guide_legend(ncol=2, byrow=TRUE), name="Algorithm") +
  ylab("Misclassification Rate") + xlab("Embedding Dimension") +
  labs(shape="Simulation", color="Algorithm") +
  ggtitle("Real Data Average Results") + theme_bw()
```
We use a one-sided Wilcoxon signed-rank test for the following two hypotheses, for algorithms $u$ and $v$ on each dataset $j$:
\begin{align}
  H_0^{r, u, v}&: r^*_{u, j} \geq r^*_{v, j} \\
  H_A^{r, u, v}&: r^*_{u, j} < r^*_{v, j}
\end{align}
that the number of embedding dimensions on a single dataset is lower for algorithm $u$ than algorithm $v$, and:
\begin{align}
  H_0^{L, u, v}&: \bar{L}_{u, j}\left(r^*_{u, j}\right) \geq \bar{L}_{v, j}\left(r^*_{v, j}\right) \\
  H_A^{L, u, v}&: \bar{L}_{u, j}\left(r^*_{u, j}\right) < \bar{L}_{v, j}\left(r^*_{v, j}\right)
\end{align}
that the misclassification rate on a single dataset is lower for algorithm $u$ than algorithm $v$.
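The tests below run over the per-dataset optima in `results.exp.optimalr`. As a minimal sketch of the paired one-sided test, with made-up per-dataset optimal dimensions:

```r
# made-up optimal embedding dimensions for algorithms u and v over 6 datasets
r.u <- c(10, 15, 12, 30, 25, 18)
r.v <- c(20, 26, 14, 45, 30, 22)
# one-sided paired test of H_A: r*_u < r*_v
wilcox.test(r.u, r.v, alternative="less", paired=TRUE)$p.value
# a small p-value is evidence that algorithm u embeds lower than v
```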
```r
rhat.test <- data.frame(x=c(), y=c(), p=c())
lhat.test <- data.frame(x=c(), y=c(), p=c())
dlhat.test <- data.frame(x=c(), y=c(), p=c())
for (i in 1:length(algs)) {
  i.ss <- results.exp.optimalr[results.exp.optimalr$alg == algs[i],]
  i.ss.d <- results.exp.overall[results.exp.overall$alg == algs[i],]
  for (j in 1:length(algs)) {
    tryCatch({
      if (algs[i] == algs[j]) {
        rhat.test <- rbind(rhat.test, data.frame(x=algs[i], y=algs[j], p=NaN))
        lhat.test <- rbind(lhat.test, data.frame(x=algs[i], y=algs[j], p=NaN))
        dlhat.test <- rbind(dlhat.test, data.frame(x=algs[i], y=algs[j], p=NaN))
      } else {
        j.ss <- results.exp.optimalr[results.exp.optimalr$alg == algs[j],]
        j.ss.d <- results.exp.overall[results.exp.overall$alg == algs[j],]
        cmp <- merge(i.ss, j.ss, by=c("exp"), all=TRUE)
        cmp.d <- merge(i.ss.d, j.ss.d,
                       by=c("exp", "d", "n", "K", "ntrain", "repo", "r.max", "r.alg",
                            "lhat.chance", "lhat.lol"), all=FALSE)
        rhat.test <- rbind(rhat.test,
                           data.frame(x=algs[j], y=algs[i],
                                      p=wilcox.test(cmp$r.alg.x, cmp$r.alg.y,
                                                    alternative="less", paired=TRUE)$p.value))
        lhat.test <- rbind(lhat.test,
                           data.frame(x=algs[j], y=algs[i],
                                      p=wilcox.test(cmp$lhat.alg.x/cmp$lhat.chance.x,
                                                    cmp$lhat.alg.y/cmp$lhat.chance.y,
                                                    alternative="less", paired=TRUE)$p.value))
        k.cmp <- c()
        for (exp in exp_names) {
          cmp.d.exp <- cmp.d[cmp.d$exp == exp,]
          k.exp <- mean(cmp.d.exp$lhat.alg.x < cmp.d.exp$lhat.alg.y)
          if (!is.nan(k.exp)) {
            k.cmp <- c(k.cmp, k.exp)
          }
        }
        tstat.alt <- sum(k.cmp)
      }
    }, error=function(e) {NaN})
  }
}
lhat.test$x <- factor(lhat.test$x, levels=algs); lhat.test$y <- factor(lhat.test$y, levels=algs)
rhat.test$x <- factor(rhat.test$x, levels=algs); rhat.test$y <- factor(rhat.test$y, levels=algs)
# floor p-values at .001 so the log color scale stays bounded
rhat.test$p[rhat.test$p < .001] <- .001
lhat.test$p[lhat.test$p < .001] <- .001
lhat.hmap <- ggplot(lhat.test, aes(x=x, y=y, fill=p)) +
  geom_tile() + geom_text(aes(label=round(p, 3))) +
  scale_fill_gradientn(name=TeX("$p$-value"), trans="log", breaks=c(0.001, 0.01, 0.1, 1),
                       colours=rev(c("#f2f0f7", "#cbc9e2", "#9e9ac8", "#6a51a3")),
                       limits=c(0.001, 1)) +
  ggtitle(TeX("(B) Test of whether $\\hat{L}_{v, j} < \\hat{L}_{u, j}$")) +
  xlab("Algorithm u") + ylab("Algorithm v")
rhat.hmap <- ggplot(rhat.test, aes(x=x, y=y, fill=p)) +
  geom_tile() + geom_text(aes(label=round(p, 3))) +
  scale_fill_gradientn(name=TeX("$p$-value"), trans="log", breaks=c(0.001, 0.01, 0.1, 1),
                       colours=rev(c("#f2f0f7", "#cbc9e2", "#9e9ac8", "#6a51a3")),
                       limits=c(0.001, 1)) +
  ggtitle(TeX("(C) Test of whether $r_{v, j} < r_{u, j}$")) +
  xlab("Algorithm u") + ylab("Algorithm v")
print(lhat.hmap)
print(rhat.hmap)
```
The normalized misclassification rate per dataset, shown as a beeswarm plot:
```r
legend.style <- guide_legend(ncol=2, byrow=TRUE)
ggplot(data=subset(results.exp.optimalr, alg != "LOL"),
       aes(x=alg, y=lhat.norm, color=alg, group=alg)) +
  geom_beeswarm(alpha=0.5, size=2) +
  xlab("Algorithm") +
  scale_y_continuous(limits=c(-0.2, .15)) +
  scale_color_manual(values=acols, guide=legend.style, name="Algorithm") +
  ylab(TeX("$Mean\\left(\\frac{\\hat{L}_{LOL} - \\hat{L}_{Alg}}{\\hat{L}_{chance}}\\right)$")) +
  theme_bw()
```
```r
rs <- plot.normlol.results$r[plot.normlol.results$alg == "PLS"]
lhats <- plot.normlol.results$lhat[plot.normlol.results$alg == "PLS"]
print(sprintf("Lower-Left (r <= 0 & lhat <= 0): %.3f", sum(rs <= 0 & lhats <= 0)/length(rs)))
print(sprintf("Lower-Right (r >= 0 & lhat <= 0): %.3f", sum(rs >= 0 & lhats <= 0)/length(rs)))
print(sprintf("Top-Left (r <= 0 & lhat >= 0): %.3f", sum(rs <= 0 & lhats >= 0)/length(rs)))
print(sprintf("Top-Right (r >= 0 & lhat >= 0): %.3f", sum(rs >= 0 & lhats >= 0)/length(rs)))
print(sprintf("Center (r == 0 & lhat == 0): %.3f", sum(rs == 0 & lhats == 0)/length(rs)))
```
And the difference between the normalized misclassification rate of LOL and that of PLS, as a function of several different variables:
```r
pls.ss <- subset(results.exp.optimalr, alg == "PLS")
# mean of the LOL and PLS misclassification rates per dataset
pls.ss$lhat.mean <- apply(pls.ss[, c("lhat.alg", "lhat.lol")], 1,
                          function(x) {as.numeric(mean(c(x[1], x[2]), na.rm=TRUE))})
chancep <- ggplot(pls.ss, aes(x=lhat.chance, y=lhat.norm)) +
  geom_point(size=1.5) + geom_smooth(method="loess", size=1.5) +
  xlab(TeX("$\\hat{L}_{chance}$")) + ylab("")
pp <- ggplot(pls.ss, aes(x=d, y=lhat.norm)) +
  geom_point(size=1.5) + geom_smooth(method="loess", size=1.5) +
  xlab(TeX("p")) + ylab("") + scale_x_continuous(trans=log10_trans())
np <- ggplot(pls.ss, aes(x=n, y=lhat.norm)) +
  geom_point(size=1.5) + geom_smooth(method="loess", size=1.5) +
  xlab(TeX("n")) + ylab("") + scale_x_continuous(trans=log10_trans())
npp <- ggplot(pls.ss, aes(x=n/d, y=lhat.norm)) +
  geom_point(size=1.5) + geom_smooth(method="loess", size=1.5) +
  xlab(TeX("n/p")) + ylab("") + scale_x_continuous(trans=log10_trans())
kp <- ggplot(pls.ss, aes(x=K, y=lhat.norm)) +
  geom_point(size=1.5) + geom_smooth(method="loess", size=1.5) +
  ylab("") + xlab(TeX("K"))
mp <- ggplot(pls.ss, aes(x=lhat.mean, y=lhat.norm)) +
  geom_point(size=1.5) + geom_smooth(method="loess", size=1.5) +
  ylab("") + xlab(TeX("$Mean\\left(\\hat{L}_{LOL}, \\hat{L}_{PLS}\\right)$"))
pls.ss$alpha <- ifelse(pls.ss$lhat.norm < 0, 1, 0)
pls.ss$alpha <- factor(pls.ss$alpha, levels=c(1, 0))
acols <- c("#00ff00", "#ff0000")
names(acols) <- c(1, 0)
labs.alpha <- lapply(c("$\\hat{L}_{LOL} < \\hat{L}_{PLS}$", "$\\hat{L}_{LOL} > \\hat{L}_{PLS}$"), TeX)
np_diffp <- ggplot(pls.ss, aes(x=n, y=d, color=alpha)) +
  geom_point(size=1.5) + scale_x_continuous(trans=log10_trans()) +
  scale_color_manual(values=acols, labels=labs.alpha,
                     name=TeX("Normalized $Mean\\left(\\frac{\\hat{L}_{LOL} - \\hat{L}_{PLS}}{\\hat{L}_{chance}}\\right)$"))
layt.mtx <- rbind(c(1, 2, 3, 7, 7, 7), c(4, 5, 6, 7, 7, 7))
grid.arrange(arrangeGrob(chancep, pp, np, npp, kp, mp, np_diffp,
                         left=TeX("$Mean\\left(\\frac{\\hat{L}_{LOL} - \\hat{L}_{PLS}}{\\hat{L}_{chance}}\\right)$"),
                         layout_matrix=layt.mtx))
```