require(lolR) require(ggplot2) require(latex2exp) require(MASS) require(gridExtra) require(data.table) require(reshape2) require(R.matlab) require(grid) require(plyr) classifier.alg = "lda" # compute the cutoff for the particular trial to get an approximate elbow # by computing the smallest r with an associated lhat within 5% # of the global minimum lhat compute_cutoff <- function(rs, lhats, t=0.05) { sr.ix <- sort(rs, decreasing=FALSE, index.return=TRUE)$ix # compute minimum value min.lhat <- min(lhats) # compute minimum value + 5% lhat.thresh <- (1 + t)*min.lhat # find which indices are all below this lhat.below <- which(lhats <= lhat.thresh) rs.below <- rs[lhat.below]; lhats.below <- lhats[lhat.below] tmin.ix <- min(rs.below, index.return=TRUE) return(list(r=rs.below[tmin.ix], lhat=lhats.below[tmin.ix])) } w=.8 h=.2 sim_cov_plot <- function(Sigmas, mus, priors, title="", yl="Dimension", xl="Dimension", ndim=10, nbreaks=4, legend.name=TeX("")) { Sigma <- lolR:::lol.mvr(Sigmas, mus, priors) Sigma <- Sigma[1:ndim, 1:ndim] # subset Sigma <- (Sigma - min(Sigma))/(max(Sigma) - min(Sigma)) labs <- c(1, 10) sdat <- melt(Sigma) plot_cov <- ggplot(sdat, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + ggtitle(title) + xlab(xl) + ylab(yl) + theme_bw() + scale_x_continuous(breaks=labs) + scale_y_reverse(breaks=labs) + theme(legend.position="bottom") +#, axis.title=element_text(size=14)) + theme(plot.margin = unit(c(h,w,h,h), "cm")) + scale_fill_gradientn(name=legend.name, colours=c("#fcfbfd", "#9e9ac8", "#3f007d"), limits=c(0, 1), breaks=c(0.0, 0.5, 1.0))#, #guide=guide_colorbar(title.position="top", title.hjust = .5, barheight=.75)) } mcols <- c("#808080", "#EE7600", "#0e3ec1","#469990") #"#8B4513") names(mcols) <- c("outlier", "1", "2", "3") sim_mean_plot <- function(mus, title="", ylab="Magnitude", xlab="Dimension", ndim=10, nbreaks=4) { dat <- data.frame(mus[1:ndim,]) dat <- cbind(data.frame(1:ndim), dat) K <- dim(mus)[2] ylabs <- sapply(1:K, function(k) as.character(k)) colnames(dat) <- c("Dimension", ylabs) dat <- melt(dat, id="Dimension") xlabs <- c(1, 10) colnames(dat) <- c("Dimension", "Class", "Magnitude") dat$Magnitude = dat$Magnitude/(max(abs(dat$Magnitude)) + .0001) lims <- c(-1, 1) breaks= c(-1, 0, 1) dat$Class <- factor(dat$Class, levels=c(1, 2, 3)) plot_mean <- ggplot(dat, aes(x=Dimension, y=Magnitude, color=Class)) + geom_line(size=1.2) + theme_bw() + ggtitle(title) + xlab(xlab) + ylab(ylab) + scale_y_continuous(limits=lims, breaks=breaks) + scale_x_continuous(breaks=xlabs) + theme(legend.position="bottom") + theme(plot.margin = unit(c(h,w,h,h), "cm")) + scale_color_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5)) } plot_sim_lhats <- function(data, cols, pt.dat, linestyle, title="", by=10, from=10, ylab="Misclassification Rate", xlab="# Embedded Dimensions", fsize=12) { lims <- c(floor(10*min(data$lhat))/10, ceiling(10*max(data$lhat))/10) if (unique(data$sim)[1] == "Toeplitz") { length.out=4 } else { length.out=3 } breaks = unique(round(seq(from=lims[1], to=lims[2], length.out = length.out), digits=1)) xlims <- c(min(data$r), max(data$r)) xbreaks <- seq(from=from, to=xlims[2], by=by) plot_sims <- ggplot(data, aes(x=r, y=lhat, linetype=alg, color=alg)) + geom_line(size=.95) + scale_color_manual(values=cols, guide=guide_legend(ncol=4, byrow=TRUE, override.aes=list(shape=NA)), name="Algorithm") + scale_linetype_manual(values=linestyle, guide=guide_legend(ncol=4, byrow=TRUE), name="Algorithm") + #geom_point(data=pt.dat, aes(x=r, y=lhat, color=alg), size=2) + #geom_line(data=base::subset(data, alg == "CCA"), aes(x=r, y=lhat, group=alg, linetype color=alg), size=.75) + #geom_point(data=base::subset(pt.dat, alg == "CCA"), aes(x=r, y=lhat, group=alg, color=alg), size=2) + #geom_line(data=base::subset(data, alg != "CCA" & alg != "QOQ"), aes(x=r, y=lhat, group=alg, color=alg), size=.75) + #geom_point(data=base::subset(pt.dat, alg != "CCA"), aes(x=r, y=lhat, group=alg, color=alg), size=2) + #geom_line(data=base::subset(data, alg == "QOQ"), aes(x=r, y=lhat, group=alg, color=alg), linetype="dashed", size=.75) + xlab(xlab) + ylab(ylab) + ggtitle(title) + theme_bw() + scale_y_continuous(limits=lims, breaks=breaks) + scale_x_continuous(limits=xlims, breaks=xbreaks) + theme(plot.margin = unit(c(h,w,h,h), "cm")) + theme(legend.position="bottom", axis.title.y=element_text(size=fsize)) return(plot_sims) } sim_twod_scatter <- function(X, Y, title="", xlab="", ylab="", nbreaks=4, outlier=NULL, d=c(1, 2)) { reorder <- c() for (ylab in unique(Y)) { reorder <- c(reorder, sample(1:dim(X[Y == ylab,])[1], size=dim(X[Y==ylab,])[1], replace=FALSE)) } if (!is.null(outlier)) { Y[outlier$inlier == FALSE] = "outlier" } X[, d[1]] <- X[, d[1]]/max(abs(X[, d[1]])) X[, d[2]] <- X[, d[2]]/max(abs(X[, d[2]])) xmax <- max(abs(X[,d[1]])); xmin <- -xmax ymax <- max(abs(X[,d[2]])); ymin <- -ymax X <- X[reorder,]; Y <- Y[reorder] df.dat <- data.frame(d1=X[, d[1]], d2=X[, d[2]], class=as.factor(Y)) plot_sims <- ggplot(df.dat, aes(x=d1, y=d2, color=class)) + geom_point(alpha=0.5) + #xlab(paste("Dimension", d[1])) + #ylab(paste("Dimension", d[2])) + xlab(xlab) + ylab(ylab) + ggtitle(title) + theme_bw() + xlim(xmin, xmax) + ylim(ymin, ymax) + theme(legend.position="bottom") + theme(plot.margin = unit(c(h,w,h,h), "cm")) + scale_color_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5)) return(plot_sims) } sim_nd_scatter <- function(X, Y, title="", ylab="Magnitude", xlab="Dimension", ndim=10, nbreaks=4) { reorder <- c() for (ylab in unique(Y)) { reorder <- c(reorder, sample(1:dim(X[Y == ylab,])[1], size=dim(X[Y==ylab,])[1]/4, replace=FALSE)) } X <- X[reorder,]; Y <- Y[reorder] xmax <- max(abs(X)); xmin <- -xmax df.dat <- melt(X) names(df.dat) <- c("Point", "Dimension", "Magnitude") df.dat$class <- as.factor(Y[df.dat$Point]) plot_sims <- ggplot(df.dat, aes(x=Dimension, y=Magnitude, color=class)) + geom_point(alpha=0.5) + xlab(xlab) + ylab(ylab) + ylim(xmin, xmax) + ggtitle(title) + theme_bw() + theme(legend.position="bottom") + theme(plot.margin = unit(c(h,w,h,h), "cm")) + scale_color_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5)) return(plot_sims) } sim_d_fill <- function(mus, Sigmas, outlier=NULL, title="", ylab="Magnitude", number="", xlab="Dimension", ndim=10, nbreaks=4) { mu.dat <- melt(mus) d <- dim(mus)[1] names(mu.dat) <- c("Dimension", "Class", "Magnitude") mu.dat$std <- NaN for (i in 1:dim(mus)[2]) { mu.dat$std[mu.dat$Class == i] = sqrt(diag(Sigmas[,,i]))[mu.dat[mu.dat$Class == i,]$Dimension] } if (!is.null(outlier)) { outlier.dat <- data.frame(Dimension=c(), Class=c(), Magnitude=c(), std=c()) for (i in 1:dim(outlier$mu.outlier)[2]) { outlier.dat <- rbind(outlier.dat, data.frame(Dimension=1:d, Class="outlier", Magnitude=outlier$mu.outlier[,i], std=sqrt(diag(outlier$sigma.outlier[,,i])))) } outlier.dat$Magnitude = 0 mu.dat <- rbind(mu.dat, outlier.dat) } mu.dat$Class <- factor(mu.dat$Class, names(mcols)) rescale <- max(abs(mu.dat$Magnitude) + mu.dat$std) mu.dat$Magnitude <- mu.dat$Magnitude/rescale mu.dat$std <- mu.dat$std/rescale plot_sims <- ggplot(mu.dat, aes(x=Dimension, y=Magnitude)) + geom_ribbon(alpha=0.3, aes(fill=Class, ymin=Magnitude + std, ymax=Magnitude - std)) + geom_line(aes(color=Class), size=1.5, alpha=0.9) + xlab(xlab) + # ylab(ylab) + ylab(paste(number, title, sep=" ")) + ggtitle(title) + theme_bw() + theme(legend.position="bottom") + scale_y_continuous(breaks=c(-1, 0, 1)) + theme(plot.margin = unit(c(h,w,h,h), "cm")) + scale_color_manual(values=mcols) +#, guide=guide_legend(title.position="top", title.hjust = .5)) scale_fill_manual(values=mcols)#, guide=guide_legend(title.position="top", title.hjust = .5)) return(plot_sims) } g_legend<-function(a.gplot){ tmp <- ggplot_gtable(ggplot_build(a.gplot)) leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") legend <- tmp$grobs[[leg]] return(legend) }
The below code will produce the required data, which runs LOL, cPCA, PCA, and LR-CCA at the desired simulation settings. Note that this function will multi-thread natively, and took approximately 7 hours to run on a 96 core machine with $\frac{7}{8}$ of the cores active.
source('./sims_driver.R')
Borrowing results from an earlier matlab implementation:
toep <- readMat('./data/sims/toeplitz.mat') tr2 <- readMat('./data/sims/rtrunk.mat') tr3 <- readMat('./data/sims/3trunk.mat') ft <- readMat('./data/sims/fat_tails.mat') qd <- readMat('./data/sims/r2toeplitz.mat') maxr <- c(90, 30, 30, 30, 30) minr <- 0 mats <- list(toep, tr2, tr3, ft, qd) sim_name <- c("Toeplitz", "Trunk-2", "Trunk-3", "Fat-Tails (D=1000)", "QDA") interest <- list(c("ROAD"), c("ROAD"), c("LASSO"), c("ROAD"), c("ROAD")) key <- c("ROAD", "lasso") names(key) <- c("ROAD", "LASSO") resultsm <- data.frame(sim=c(), iter=c(), alg=c(), r=c(), lhat=c()) for (k in 1:length(mats)) { dat <- mats[[k]] desired_r <- 1:maxr[k] for (i in 1:length(dat$ks)) { # i encodes simulation iteration for (j in length(interest[[k]])) { algname <- key[interest[[k]][j]] algid <- which(dimnames(dat$ks[[i]][[1]])[[1]] == algname) rs <- dat$ks[[i]][[1]][algid,,1][[algname]] algid <- which(dimnames(dat$Lhat)[[1]] == algname) lhats <- dat$Lhat[algid,,][[i]] lhat_adjust <- spline(rs, lhats, xout=desired_r, method='fmm', ties=mean) resultsm <- rbind(resultsm, data.frame(sim=sim_name[k], iter=i, alg=interest[[k]][j], r=lhat_adjust$x, lhat=lhat_adjust$y)) } } }
First, we prepare the plots of subsets of the mean and covariance matrices:
# run the simulations once to obtain some basic visualizations n=750 # the simulations to call themselves sims <- list(lol.sims.rtrunk, lol.sims.rev_rtrunk, lol.sims.cross) maxr <- c(30, 30, 30) ds <- c(100, 100, 100) # additional arguments for each simulation scenario opt_args <- list(list(K=3), list(robust=n*.4, maxvar.outlier=1000), list()) #sim_names = c("Trunk-2", "Toeplitz", "Trunk-3", "Fat-Tails (D=1000)", "Robust", "Cross") #sim_titles = c("(A)", "(B)", "(C)", "(D)", "(E)", "(F)") sim_names = c("Trunk-3", "Robust", "Cross") sim_titles = c("(A)", "(B)", "(C)") ndim <- c(10, 10, 10, 10, 10, 10) sim_min <- c(10, 10, 10) by <- c(10, 10, 10) cov_plots <- list() mean_plots <- list() two.scatter <- list() sim.fill <- list() counter <- 1 dims <- list(c(1, 2), c(2, 50), c(50, 51)) sim.dims <- list() for (i in 1:length(sims)) { simn <- do.call(sims[[i]], c(list(n, ds[i]), opt_args[[i]])) cov_plots[[counter]] <- sim_cov_plot(simn$Sigmas, simn$mus, simn$priors, title=sim_names[i]) mean_plots[[counter]] <- sim_mean_plot(simn$mus, title=sim_names[i], ndim=ndim[i]) sim.fill[[counter]] <- sim_d_fill(simn$mus, simn$Sigmas, outlier=simn$robust, title=sim_names[i], number=sim_titles[i]) d <- dims[[i]] sim.dims[[counter]] <- sim_twod_scatter(simn$X, simn$Y, title=paste(sim_titles[i], sim_names[i]), outlier=simn$robust, d=d) counter <- counter + 1 }
Next, we aggregate over the respective iterations, and subset plots for each function:
# read the results in results <- readRDS(paste('../data/sims/lol_sims_', classifier.alg, '.rds', sep="")) #results <- results$overall nan.mean <- function(x) mean(x, na.rm=TRUE) results.means <- aggregate(lhat ~ sim + alg + r + lhat, data = results, FUN = nan.mean) algs <- c("LOL", "RLOL", "QOQ", "PLS", "CCA", "LDA", "PCA", "RP", "ROAD", "LASSO") acols <- c("#f41711", "#f41711", "#f41711", "#94d6c9", "#87c6cc", "#99b4c6", "#020202", "#666666", "#666666", "#666666") linestyle <- c(1,3,3,1,3,1,1,3,1) linestyle <- c("solid", "dotted", "dashed", "dotted", "solid", "dotted", "solid", "solid", "solid", "dotted") names(acols) <- algs names(linestyle) <- algs sim_plots <- list() results.means$alg <- revalue(results.means$alg, c("cPCA"="LDA")) results.means$alg <- revalue(results.means$alg, c("LRLDA"="LDA")) results.means$alg <- revalue(results.means$alg, c("QOL"="QOQ")) results.means$alg <- revalue(results.means$alg, c("LOL"="LOL")) results.means$alg <- revalue(results.means$alg, c("LDA"="LDA")) counter <- 1 for (i in 1:length(sim_names)) { sim <- sim_names[i] data_sub <- results.means[results.means$sim == sim & results.means$alg %in% algs,] pt.dat <- data.frame(x=c(), y=c()) for (alg in unique(data_sub$alg)) { pt <- compute_cutoff(data_sub[data_sub$alg == alg,]$r, data_sub[data_sub$alg == alg,]$lhat) pt.dat <- rbind(pt.dat, data.frame(r=pt$r, lhat=pt$lhat, alg=alg)) } if (i == 2) { pt.dat <- pt.dat[pt.dat$alg != "QOQ",] data_sub <- data_sub[data_sub$alg != "QOQ",] } else if (i == 3) { pt.dat <- pt.dat[pt.dat$alg != "RLOL",] data_sub <- data_sub[data_sub$alg != "RLOL",] } else { pt.dat <- pt.dat[pt.dat$alg != "RLOL" & pt.dat$alg != "QOQ",] data_sub <- data_sub[data_sub$alg != "RLOL" & data_sub$alg != "QOQ",] } sim_plots[[counter]] <- plot_sim_lhats(data_sub, acols, pt.dat, linestyle, ylab=paste(sim_titles[i], sim), from=sim_min[i], by=by[i]) counter <- counter + 1 }
We merge and combine the plots:
nsim <- length(sim_names) fsize=12 sim_leg.plot <- ggplot(data=subset(results.means, !(alg %in% c("ROAD", "LASSO"))), aes(x=r, y=lhat, color=alg, linetype=alg)) + geom_line(size=1.2) + scale_color_manual(values=acols, guide=guide_legend(ncol=2, byrow=FALSE), name="Algorithm") + scale_linetype_manual(values=linestyle, guide=guide_legend(ncol=2, byrow=FALSE), name="Algorithm") + theme(legend.position="bottom", axis.title.y=element_text(size=fsize)) + theme_bw() sim_leg <- g_legend(sim_leg.plot) cov_leg <- g_legend(cov_plots[[1]]) mean_leg <- g_legend(mean_plots[[3]]) dummydat <- data.frame(Class=sample(names(mcols), size=50, replace=TRUE), Dimension=1:50, Magnitude=0, std=2) simfill_leg.plot <- ggplot(dummydat, aes(x=Dimension, y=Magnitude, color=Class, fill=Class)) + geom_line(size=1.5) + geom_ribbon(aes(ymin=Magnitude -std, ymax=Magnitude + std), alpha=0.3) + scale_color_manual(values=mcols, guide=guide_legend(nrow=4, byrow=TRUE)) + scale_fill_manual(values=mcols, guide=guide_legend(nrow=4, byrow=TRUE)) simfill_leg <- g_legend(simfill_leg.plot) # remove the legends from the plots sim_plots <- sapply(1:length(sim_plots), function(j) { resp <- sim_plots[[j]] + ggtitle("") +theme(legend.position=NaN) # remove the ylabel of only the non-left most columns if (j != 1) { resp <- resp + xlab("") + ylab("") + theme(axis.text.x=element_text(color="#FFFFFF")) } else { resp <- resp + ylab("Error") } return(resp) }, simplify=FALSE) mean_plots <- sapply(1:length(mean_plots), function(j) { resp <- mean_plots[[j]] + ggtitle("") + theme(legend.position=NaN) # remove the ylabel of only the non-left most columns if (j != 1) { resp <- resp + xlab("") + ylab("") } return(resp) }, simplify=FALSE) cov_plots <- sapply(1:length(cov_plots), function(j) { resp <- cov_plots[[j]] + ggtitle("") + theme(legend.position=NaN) # remove the ylabel of only the non-left most columns if (j != 1) { resp <- resp + xlab("") + ylab("") } return(resp) }, simplify=FALSE) sim.fill <- sapply(1:length(sim.fill), function(j) { resp <- sim.fill[[j]] + ggtitle("") + theme(legend.position=NaN) # remove the ylabel of only the non-left most columns if (j != 1) { resp <- resp + xlab("") +# ylab("") + theme(axis.text.x=element_text(color="#FFFFFF"), axis.text.y=element_text(color="#FFFFFF")) } return(resp) }, simplify=FALSE) sim.dims <- sapply(1:length(sim.dims), function(j) { resp <- sim.dims[[j]] + theme(legend.position=NaN) # remove the ylabel of only the non-left most columns if (j != 0) { resp <- resp + xlab("") + ylab("") } else { resp <- resp + xlab("") + ylab("") } return(resp) }, simplify=FALSE) tfonts = 14 grid_sim <- grid.arrange(grid.arrange(grobs=sim_plots, ncol=nsim), sim_leg, nrow=1, widths=c(.95, .18) ) grid_mean <- grid.arrange(grid.arrange(grobs=mean_plots, ncol=nsim), mean_leg, nrow=2, heights=c(.95, .09), top=textGrob("Means\n(First 10 Dimensions)", gp=gpar(fontsize=tfonts, face="bold"))) grid_cov <- grid.arrange(grid.arrange(grobs=cov_plots, ncol=nsim), cov_leg, nrow=2, heights=c(.95, .09), top=textGrob("Covariances\n(First 10 Dimensions)", gp=gpar(fontsize=tfonts, face="bold"))) grid_simfill <- grid.arrange(grid.arrange(grobs=sim.fill, ncol=nsim), simfill_leg, nrow=2, heights=c(.95, .09), top=textGrob("Generative Model\n(p=100)", gp=gpar(fontsize=tfonts, face="bold"))) grid_simdim <- grid.arrange(grid.arrange(grobs=sim.dims, ncol=nsim), simfill_leg, nrow=1, widths=c(.95, .18) )
We combine and plot:
grid.arrange(grid_simdim, grid_sim, nrow=2, heights=c(0.35, 0.35))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.