knitr::opts_chunk$set(echo = TRUE)
require(ggplot2) require(reshape2) require(grid) require(gridExtra) require(mgc) require(ICC) require(I2C2) require(cowplot) require(lolR) require(plyr) require(scales) require(dplyr) require(latex2exp) require(MASS)
g_legend<-function(a.gplot){ tryCatch({ tmp <- ggplot_gtable(ggplot_build(a.gplot)) leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") legend <- tmp$grobs[[leg]] return(legend) }, error=function(e) {return(ggplot() + theme_void())}) }
source('./shared_scripts.R') sim.no_signal <- function(n=128, d=2, sigma=1) { # classes are from same distribution, so signal should be detected w.p. alpha samp <- sim_gmm(mus=cbind(rep(0, d), rep(0,d)), Sigmas=abind(diag(d), diag(d), along=3), n, priors=c(0.5,0.5)) samp$X=samp$X + array(rnorm(n*d), dim=c(n, d))*sigma return(list(X=samp$X, Y=samp$Y, Z=samp$Y)) } ## Samples from Multiclass Gaussians # a simulation where there are multiple classes present, and a correlation structure # 2 classes sim.multiclass_gaussian <- function(n, d, K=16, sigma=0) { # centers for the individuals mu.class <- rep(0, d) S.class <- diag(d)*sqrt(K) # sample the individual centers mus.class <- t(mvrnorm(n=K, mu.class, S.class)) # individual covariances S.k <- diag(d) S.k[upper.tri(S.k)] <- 0.5 # correlated S.k[lower.tri(S.k)] <- 0.5 Sigmas <- abind(lapply(1:K, function(k) S.k), along=3) # sample individuals w.p. 1/K samp <- sim_gmm(mus=mus.class, Sigmas=Sigmas, n, priors=rep(1/K, K)) samp$X=samp$X + array(rnorm(n*d)*sigma, dim=c(n, d)) mus.z <- as.numeric(sapply(1:K, function(k) { return(mus.class[1,k] <= 0) })) + 1 Z <- mus.z[samp$Y] return(list(X=samp$X, Y=samp$Y, Z=Z)) } sim.crossed_sig2 <- function(n=128, d=2, sigma=0) { # class mus K=2 mu.class <- rep(0, d) S.class <- diag(d)*sqrt(K) mus.class <- t(mvrnorm(n=K, mu.class, S.class)) # crossed signal Sigma.1 <- cbind(c(2,0), c(0,0.1)) Sigma.2 <- cbind(c(0.1,0), c(0,2)) # covariances are orthogonal mus=cbind(rep(0, d), rep(0, d)) rho <- runif(1, min=-.2, max=.2) # add random correlation Sigmas <- abind(Sigma.1, Sigma.2, along = 3) Sigmas[1,2,1] <- Sigmas[2,1,1] <- rho Sigmas[1,2,2] <- Sigmas[2,1,2] <- -rho # sample from crossed gaussians w p=0.5, 0.5 respectively sim <- sim_gmm(mus=cbind(rep(0, d), rep(0, d)), Sigmas=Sigmas, n, priors=c(0.5, 0.5)) X <- sim$X + array(rnorm(n*d)*sigma, dim=c(n, d)) Y <- sim$Y return(list(X=X, Y=Y, Z=Y)) } # 8 pairs of annulus/discs sim.multiclass_ann_disc2 <- function(n, d, sigma=0) { mus <- cbind(c(0, 0)) # probability of being each individual is 1/K ni <- rowSums(rmultinom(n, 1, prob=rep(1/2, 2))) X <- array(NaN, dim=c(n, d)) X[1:ni[1],] <- sweep(mgc.sims.2ball(ni[1], d, r=1, cov.scale=0.1), 2, mus[,1], "+") X[(ni[1] + 1):n,] <- sweep(mgc.sims.2sphere(ni[2], r=1.5, d=d, cov.scale=0.1), 2, mus[,1], "+") X <- X + array(rnorm(n*d)*sigma, dim=c(n, d)) Y <- c(rep(1, ni[1]), rep(2, ni[2])) return(list(X=X, Y=Y, Z=Y)) } sim.xor2 <- function(n, d, sigma=0) { mus <- cbind(c(0, 0), c(1,1), c(1, 0), c(0, 1)) Y <- rep(1:ncol(mus), n/ncol(mus)) X <- mvrnorm(n=n, mu=c(0, 0), Sigma=sigma*diag(d)) + t(mus[,Y]) ordx.Y <- sort(Y, index.return=TRUE, decreasing=FALSE)$ix Y <- floor((Y-1)/2) return(list(X=X[ordx.Y,], Y=Y[ordx.Y] + 1, Z=Y[ordx.Y] + 1)) }
n <- 256; d <- 2 simulations <- list(sim.multiclass_gaussian, sim.crossed_sig2, sim.multiclass_ann_disc2, sim.xor2, sim.no_signal) names(simulations) <- c("(i) Gaussian", "(ii) Cross", "(iii) Ball/Circle", "(iv) XOR", "(v) No Signal") colors <- c('#e6194b','#4363d8', '#ffe119', '#3cb44b', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff') sim.dat <- lapply(names(simulations), function(sim.name) { print(sim.name) set.seed(1234) if (sim.name == "(iv) XOR") { sim <- do.call(simulations[[sim.name]], list(n=n, d=d, sigma=0.05)) } else { sim <- do.call(simulations[[sim.name]], list(n=n, d=d, sigma=0)) } if (sim.name == "(ii) Cross") { # sort cross distance matrix within-spoke for (class in unique(sim$Y)) { if (class == 1) { samp.y1 <- which(sim$Y == class) idx.y1 <- sort(sim$X[samp.y1,1], decreasing = TRUE, index.return=TRUE)$ix samp.y1 <- samp.y1[idx.y1] } else { samp.y2 <- which(sim$Y == class) idx.y2 <- sort(sim$X[samp.y2,2], decreasing = TRUE, index.return=TRUE)$ix samp.y2 <- samp.y2[idx.y2] } } samp.order <- c(samp.y1, samp.y2) sim$X <- sim$X[samp.order,]; sim$Y <- sim$Y[samp.order] } else if (sim.name == "(iii) Ball/Circle") { # sort Annulus/Disc by spherical angle for (class in unique(sim$Y)) { if (class == 1) { samp.y1 <- which(sim$Y == class) idx.y1 <- sort(apply(sim$X[samp.y1,], 1, function(x) atan(x[2]/x[1])), decreasing = TRUE, index.return=TRUE)$ix samp.y1 <- samp.y1[idx.y1] } else { samp.y2 <- which(sim$Y == class) idx.y2 <- sort(apply(sim$X[samp.y2,], 1, function(x) atan(x[2]/x[1])), decreasing = TRUE, index.return=TRUE)$ix samp.y2 <- samp.y2[idx.y2] } } samp.order <- c(samp.y1, samp.y2) sim$X <- sim$X[samp.order,]; sim$Y <- sim$Y[samp.order] } dmtx <- melt(mgc.distance(sim$X)) dmtx$value <- dmtx$value^2 dmtx$value <- (dmtx$value - min(dmtx$value))/(max(dmtx$value) - min(dmtx$value)) dmtx$sim.name <- sim.name return(list(data=data.frame(sim.name=sim.name, x1=(sim$X[,1] - min(sim$X[,1]))/(max(sim$X[,1]) - min(sim$X[,1])), x2=(sim$X[,2] - min(sim$X[,2]))/(max(sim$X[,2]) - min(sim$X[,2])), y=sim$Y, z=sim$Z), dmtx=dmtx)) }) sim.plots <- do.call(rbind, lapply(sim.dat, function(sim) sim$data)) %>% mutate(sim.name=factor(sim.name, levels=names(simulations), ordered=TRUE), y=factor(y), z=factor(z), plot.name="(A) Sample Data") %>% ggplot(aes(x=x1, y=x2, color=y, shape=z)) + geom_point() + facet_grid(sim.name ~ plot.name, scales="free", switch="y") + xlab("Dimension 1") + ylab("Dimension 2") + scale_color_manual(values=colors, guide=FALSE) + theme_bw() + scale_shape_discrete(name="Class") + theme(axis.text=element_text(color="#FFFFFF"), axis.ticks=element_blank(), legend.position="bottom") dmtx.plots <- do.call(rbind, lapply(sim.dat, function(sim) sim$dmtx)) %>% mutate(sim.name=factor(sim.name, levels=names(simulations), ordered=TRUE), plot.name="(B) Distance Matrix") %>% ggplot(aes(x=Var1, y=Var2, fill=sqrt(value))) + geom_tile() + xlab("Sample 1") + ylab("Sample 2") + scale_x_continuous(labels=c(1, 128), breaks=c(1, 256), expand=c(0,0)) + scale_y_continuous(labels=c(1, 128), breaks=c(1, 256), expand=c(0,0)) + theme_bw() + scale_fill_gradient(name="Distance", low="#FFFFFF", high="#9900FF", labels=c(0, 1), breaks=c(0, 1)) + facet_grid(sim.name ~ plot.name, scales="free") + theme(strip.background.y = element_blank(), strip.text.y=element_blank(), legend.position="bottom")
al=0.05 # alpha for the power testing sample.test.results <- readRDS('../data/sims/discr_sims_os.rds')$os.results %>% dplyr::filter(stat.name != "Kernel" & !(stat.name == "MMD" & sim.name == "Gaussian") & !(stat.name == "HSIC" & sim.name != "Gaussian") & !(stat.name == "DISCO")) %>% dplyr::mutate(stat.name=recode_factor(stat.name, "HSIC"="Kernel", "MMD"="Kernel")) %>% dplyr::mutate(test.name="(D) One Sample Test") %>% rbind( readRDS('../data/sims/discr_sims_ts.rds')$ts.results %>% dplyr::filter(stat.name != "Kernel" & !(stat.name == "MMD" & sim.name == "Gaussian") & !(stat.name == "HSIC" & sim.name != "Gaussian") & !(stat.name == "DISCO")) %>% dplyr::mutate(stat.name=recode_factor(stat.name, "HSIC"="Kernel", "MMD"="Kernel")) %>% dplyr::mutate(test.name="(E) Two Sample Test") ) %>% dplyr::mutate(outcome=p.value < al) %>% dplyr::select(sim.name, n, d, i, stat.name, sigma, outcome, test.name) %>% rbind( readRDS('../data/sims/discr_sims_bound.rds')$bound.results %>% dplyr::mutate(test.name="(C) Comparison to Error") %>% dplyr::rename(stat.name=algorithm, outcome=value) %>% dplyr::filter(stat.name != "Kernel" & !(stat.name == "MMD" & sim.name == "Gaussian") & !(stat.name == "HSIC" & sim.name != "Gaussian") & !(stat.name == "DISCO")) %>% dplyr::mutate(stat.name=recode_factor(stat.name, "HSIC"="Kernel", "MMD"="Kernel")) %>% dplyr::mutate(outcome=round(outcome, digits=2)) %>% dplyr::mutate(outcome=ifelse(stat.name=="bayes", 1-outcome, outcome), sim.name=recode_factor(sim.name, "Annulus/Disc"="Ball/Circle")) ) %>% dplyr::rename("Simulation"=sim.name, "Statistic"=stat.name, "Test"=test.name) %>% dplyr::group_by(n, d, Statistic, Simulation, sigma, Test) %>% dplyr::summarise(Value=mean(outcome)) %>% dplyr::filter(Simulation %in% c("No Signal", "Cross", "Gaussian", "Ball/Circle", "XOR")) %>% dplyr::group_by(n, d, Statistic, Simulation, Test) %>% dplyr::mutate(sigma.wt=(sigma - min(sigma))/(max(sigma) - min(sigma))) %>% dplyr::ungroup() %>% dplyr::mutate( Statistic=recode_factor(Statistic, "Discr"="Discr.", "FPI"="Finger.", "bayes"="Bayes Accuracy"), Simulation=recode_factor(Simulation, "No Signal" = "(v) No Signal", "Gaussian" = "(i) Gaussian", "Cross" = "(ii) Cross", "Ball/Circle"="(iii) Ball/Circle", "XOR"="(iv) XOR") ) %>% dplyr::mutate(Test=factor(Test), Simulation=factor(Simulation, levels=names(simulations), ordered=TRUE), Statistic=factor(Statistic, levels=c("Discr.", "PICC", "I2C2", "Finger.", "Kernel", "Bayes Accuracy"), ordered=TRUE)) alg.colors <- c("#c70000", "#a65628", "#984ea3", "#ff8c00", "#377eb8", "#4daf4a") line.types <- c(1, 1, 1, 1, 1, 1) names(alg.colors) <- names(line.types) <- c("Discr.", "PICC", "Finger.", "I2C2", "Kernel", "Bayes Accuracy")
bound.plot <- sample.test.results %>% dplyr::filter(Test == "(C) Comparison to Error") %>% dplyr::group_by(n, d, Statistic, Simulation, Test) %>% dplyr::mutate(stat.wt=(Value - min(Value))/(max(.02, max(c(Value)) - min(Value)))) %>% ggplot(aes(x=sigma.wt, y=stat.wt, color=Statistic)) + geom_line() + scale_color_manual(values=alg.colors, name="Statistic") + facet_grid(Simulation ~ Test, scales="free_x") + theme_bw() + scale_y_continuous(limits=c(-.05, 1), breaks=c(0, 1)) +#, expand=c(0, 0)) + scale_x_continuous(limits=c(0, 1), breaks=c(0, 1)) + #geom_hline(yintercept=0.05, color="black") + xlab("Normalized Variance") + ylab("Normalized Statistic") + theme(strip.background.y = element_blank(), strip.text.y=element_blank(), legend.position="bottom") + guides(color=guide_legend(nrow=2,byrow=TRUE)) sample.plot <- sample.test.results %>% dplyr::filter(Test != "(C) Comparison to Error") %>% ggplot(aes(x=sigma.wt, y=Value, color=Statistic)) + geom_line() + scale_color_manual(values=alg.colors, name="Statistic") + facet_grid(Simulation ~ Test, scales="free_x") + theme_bw() + scale_y_continuous(limits=c(-.05, 1), breaks=c(0, 1)) +#, expand=c(0, 0)) + scale_x_continuous(limits=c(0, 1), breaks=c(0, 1)) + geom_hline(yintercept=0.05, color="black") + xlab("Normalized Variance") + ylab("Statistical Power") + theme(strip.background.y = element_blank(), strip.text.y=element_blank(), legend.position="none")
9.3 x 6.66
grid.arrange(arrangeGrob(sim.plots + theme(legend.position="none"), dmtx.plots + theme(legend.position="none"), widths=c(1,.9), nrow=1), arrangeGrob(g_legend(sim.plots), g_legend(dmtx.plots), widths=c(1.1, 1)), nrow=2, heights=c(.8,.1)) grid.arrange(arrangeGrob(bound.plot + theme(legend.position="none"), sample.plot + theme(legend.position="none"), widths=c(1.1, 1.9)), g_legend(bound.plot), nrow=2, heights=c(.8, .1)) grid.arrange(arrangeGrob(sim.plots + theme(legend.position="none"), dmtx.plots + theme(legend.position="none"), bound.plot + theme(legend.position="none"), sample.plot + theme(legend.position="none"), widths=c(1, 0.9, 1.1, 1.9)), arrangeGrob(g_legend(sim.plots), g_legend(dmtx.plots), g_legend(bound.plot), widths=c(1.1, 1, 3)), nrow=2, heights=c(0.8, 0.1))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.