knitr::opts_chunk$set(fig.width = 8, fig.height = 5, fig.align = "center", message = TRUE, warning = TRUE, echo = TRUE)
library(magrittr) library(data.table) library(ggplot2) library(cowplot) library(xtable) library(bst235Project)
res_df <- read.csv("../data/simdata.csv") %>% tidyr::gather(coef, value, -n, -rho1, -rho12, -rho2, -x_dist, -sd) %>% data.table() errors_df <- res_df[startsWith(coef, "error_"), ] %>% dplyr::rename(error = coef) betas_df <- res_df[!startsWith(coef, "error_"), ]
betas_zeros <- betas_df[coef != "alpha" & coef != "beta6" & coef != "beta7" & coef != "beta8" & coef != "beta9", mean(value < 1e-8), by = .(coef, n, rho1, rho12, rho2, x_dist, sd)] %>% dplyr::rename(prop_zeros = V1)#%>% #xtable(digits = 3) %>% print(comment = FALSE) betas_nonzeros <- betas_df[coef == "beta6" | coef == "beta7" | coef == "beta8" | coef == "beta9", mean(abs(value) > 1e-8), by = .(coef, n, rho1, rho12, rho2, x_dist, sd)] %>% dplyr::rename(prop_nonzeros = V1)#%>% #xtable(digits = 3) %>% print(comment = FALSE) hist(betas_zeros$prop_zeros, xlab = "Proportion of Zero Betas = 0", main = "Proportion of Zero Betas = 0") all(betas_zeros[n == 5000 & sd == 0.01, prop_zeros] == 1) all(betas_nonzeros[n == 5000 & sd == 0.01, prop_nonzeros] == 1) betas_zeros[n == 5000 & sd == 0.3, ] %>% ggplot(aes(x = coef, y = prop_zeros, color = x_dist)) + geom_boxplot() + geom_point(position = position_dodge(width = 0.75), aes(group = x_dist), alpha = 0.5) + xlab("Coefficient") + ylab("Proportion Zero") + scale_color_discrete(name = "Covariate Distribution") + ggtitle("Proportion of Zeros Among True Zero Coefficients, n = 5000, sd = 0.3")
hist(betas_nonzeros$prop_nonzeros, xlab = "Proportion of Nonzero Betas != 0", main = "Proportion of Nonzero Betas != 0") hist(betas_nonzeros[n == 5000 & sd == 0.3, prop_nonzeros], yaxt = 'n', main = "Prop. of Nonzeros Among True Nonzero Coefficients, n = 5000, sd = 0.3", xlab = "Proportion of Nonzeros") axis(side = 2, at = seq(0, 50, 2), labels = seq(0, 50, 2)) betas_nonzeros[n == 5000 & sd == 0.3, ]
# zero betas betas_df[coef != "alpha" & coef != "beta6" & coef != "beta7" & coef != "beta8" & coef != "beta9", ] %>% ggplot(aes(x = as.factor(n), y = value, color = interaction(rho1, rho12, rho2, sep = ", "))) + geom_violin() + facet_grid(x_dist ~ as.factor(sd) + coef) + scale_color_discrete(name = "(rho1, rho12, rho2)") + xlab("n") + ylab("Coefficient Value") + ggtitle("Zero Betas") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
# nonzero betas betas_df[coef == "beta6", ratio := value / -0.3] betas_df[coef == "beta7", ratio := value / -0.1] betas_df[coef == "beta8", ratio := value / 0.1] betas_df[coef == "beta9", ratio := value / 0.3] betas_df[coef == "beta6" | coef == "beta7" | coef == "beta8" | coef == "beta9", ] %>% ggplot(aes(x = as.factor(n), y = ratio, color = interaction(rho1, rho12, rho2, sep = ", "))) + geom_boxplot() + facet_grid(x_dist ~ as.factor(sd) + coef) + scale_color_discrete(name = "(rho1, rho12, rho2)") + xlab("n") + ylab("Coefficient Value / True Value") + ggtitle("Nonzero Betas") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + geom_hline(aes(yintercept = 0))
selected_betas <- betas_df[n == 5000 & sd > 0.1 & rho1 == 0.3 & rho2 == 0.3 & coef == "beta6" | coef == "beta7" | coef == "beta8" | coef == "beta9", ] selected_betas[, mean(ratio), by = .(x_dist, coef)] %>% xtable(digits = 3) ggplot(selected_betas, aes(x = x_dist, y = ratio, color = x_dist)) + geom_boxplot() + facet_grid(~coef) + xlab("Distribution of X") + ylab("Coefficient Value / True Value") + guides(color = FALSE) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("True Nonzero Betas; n = 5000, sigma = 0.3, rho1 = rho12 = rho2 = 0.3")
# alpha betas_df[coef == "alpha", ratio := value / 3] betas_df[coef == "alpha", ] %>% ggplot(aes(x = as.factor(n), y = ratio, color = interaction(rho1, rho12, rho2, sep = ", "))) + geom_boxplot() + facet_grid(as.factor(sd) ~ x_dist) + scale_color_discrete(name = "(rho1, rho12, rho2)") + xlab("n") + ylab("Coefficient Value / True Value") + ggtitle("alpha")
p_small <- errors_df[sd == 0.01 & rho1 == 0.3 & rho2 == 0.3,] %>% ggplot(aes(y = value, color = error, x = as.factor(n))) + geom_boxplot() + facet_grid(~ x_dist) + xlab("n") + ylab("MSE") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("Error Comparison, Small Errors, rho1 = rho12 = rho3 = 0.3") p_large <- errors_df[sd > 0.1 & rho1 == 0.3 & rho2 == 0.3,] %>% ggplot(aes(y = value, color = error, x = as.factor(n))) + geom_boxplot() + facet_grid(~ x_dist) + xlab("n") + ylab("MSE") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("Error Comparison, Big Errors, rho1 = rho12 = rho3 = 0.3") plot_grid(p_small, p_large, nrow = 2)
errors_df[sd == 0.01,] %>% ggplot(aes(y = value, color = error, x = interaction(rho1, rho12, rho2, sep = ", "))) + geom_boxplot() + facet_grid(x_dist ~ n) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + xlab("(rho1, rho12, rho2)") + ylab("MSE") + ggtitle("Error Comparison, Small Errors") errors_df[sd == 0.3,] %>% ggplot(aes(y = value, color = error, x = interaction(rho1, rho12, rho2, sep = ", "))) + geom_boxplot() + facet_grid(x_dist ~ n) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + xlab("(rho1, rho12, rho2)") + ylab("MSE") + ggtitle("Error Comparison, Big Errors")
betas <- c(3, 0, 0, 0, 0, 0, -0.3, -0.1, 0.1, 0.3) corr_mat <- matrix(0.3, nrow = 9, ncol = 9) diag(corr_mat) <- 1 n <- 1000 x_simulator2 <- function(n) { return(mvtnorm::rmvnorm(n, sigma = corr_mat)) } test_simulation <- function(sd) { error_simulator <- function(n) { rnorm(n, mean = 0, sd = sd) } test_dat <- bst235Project:::sim_data(betas, x_simulator2, error_simulator, n = 10000) train_dat <- bst235Project:::sim_data(betas, x_simulator2, error_simulator, n) alasso_cv <- bst235Project:::adaptive_lasso(train_dat$xs, train_dat$ys) # fit true model true_mod_betas <- bst235Project:::fit_true_model(train_dat$xs, train_dat$ys) # get predictions true_condmean <- betas[1] + pnorm(test_dat$xs %*% betas[-1]) true_preds <- true_mod_betas[1] + pnorm(test_dat$xs%*% true_mod_betas[-1]) naive_preds <- predict(alasso_cv, newx = test_dat$xs, s = "lambda.min") # do calibration predictions train_yhat <- predict(alasso_cv, newx = train_dat$xs, s = "lambda.min") best_h <- bst235Project:::select_bandwidth(train_yhat, train_dat$ys, nfolds = 10, cv = FALSE) calibrate_preds <- bst235Project:::np_calibrate(test_dat$xs, train_dat$xs, train_dat$ys, alasso_cv, bandwidth = best_h) ret <- list(test_ys = test_dat$ys, true_condmean = true_condmean, true_preds = true_preds, naive_preds = naive_preds, calibrate_preds = calibrate_preds, train_yhat = train_yhat, train_ys = train_dat$ys) return(ret) } plot_hists <- function(testsim_obj) { par(mfrow = c(2, 2)) xlims <- range(testsim_obj$test_ys) hist(testsim_obj$test_ys, breaks = 50, xlim = xlims, main = "True Y", xlab = "True Y") hist(testsim_obj$true_preds, breaks = 50, xlim = xlims, main = "True Model Predictions", xlab = "True Model Predictions") hist(testsim_obj$naive_preds, breaks = 50, xlim = xlims, main = "ALASSO Predictions", xlab = "ALASSO Predictions") hist(testsim_obj$calibrate_preds, breaks = 50, xlim = xlims, main = "Calibrated Predictions", xlab = "ALASSO Predictions") } set.seed(1262) testsim_smallerr <- test_simulation(sd = 0.01) testsim_bigerr <- test_simulation(sd = 0.3) testsim_lst <- list(testsim_smallerr, testsim_bigerr) title_lst <- c("Small Error", "Big Error")
plot_hists(testsim_smallerr) plot_hists(testsim_bigerr)
par(mfrow = c(2, 2)) for (i in 1:2) { plot(testsim_lst[[i]]$train_yhat, testsim_lst[[i]]$train_ys, pch = 20, col = rgb(0, 0, 0, 0.3), xlab = "Train Y_hat (ALASSO)", ylab = "Train Y", main = paste0("Y vs. Y_hat, ", title_lst[i])) abline(a = 0, b = 1, col = "red", lwd = 2) plot(testsim_lst[[i]]$train_yhat, testsim_lst[[i]]$train_ys - testsim_lst[[i]]$train_yhat, pch = 20, col = rgb(0, 0, 0, 0.3), xlab = "Train Y_hat (ALASSO)", ylab = "Train Residual", main = paste0("Residual, ", title_lst[i])) }
par(mfcol = c(1, 2)) for (i in 1:2) { naive_preds <- testsim_lst[[i]]$naive_preds test_ys <- testsim_lst[[i]]$test_ys calibrate_preds <- testsim_lst[[i]]$calibrate_preds plot(naive_preds, test_ys, pch = 20, col = rgb(0, 0, 0, 0.3), xlab = "ALASSO prediction on test", ylab = "True test Y", main = paste0("Kernel Predictions, ", title_lst[i])) lines(naive_preds[order(naive_preds)], calibrate_preds[order(naive_preds)], col = "red", lwd = 2) }
par(mfrow = c(2, 3)) for (i in 1:2) { plt_lims <- range(testsim_lst[[i]]$test_ys) plot(testsim_lst[[i]]$test_ys, testsim_lst[[i]]$true_preds, ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2), xlab = "True test Y", ylab = "True model prediction", main = paste0("True Model, ", title_lst[i])) abline(a = 0, b = 1, col = "red", lwd = 2) plot(testsim_lst[[i]]$test_ys, testsim_lst[[i]]$naive_preds, ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2), xlab = "True test Y", ylab = "ALASSO prediction", main = paste0("ALASSO, ", title_lst[i])) abline(a = 0, b = 1, col = "red", lwd = 2) plot(testsim_lst[[i]]$test_ys, testsim_lst[[i]]$calibrate_preds, ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2), xlab = "True test Y", ylab = "Kernel prediction", main = paste0("Kernel, ", title_lst[i])) abline(a = 0, b = 1, col = "red", lwd = 2) }
par(mfrow = c(2, 3)) for (i in 1:2) { plt_lims <- range(testsim_lst[[i]]$true_condmean) plot(testsim_lst[[i]]$true_condmean, testsim_lst[[i]]$true_preds, ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2), xlab = "True Conditional Mean", ylab = "True model prediction", main = paste0("True Model, ", title_lst[i])) abline(a = 0, b = 1, col = "red", lwd = 2) plot(testsim_lst[[i]]$true_condmean, testsim_lst[[i]]$naive_preds, ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2), xlab = "True Conditional Mean", ylab = "ALASSO prediction", main = paste0("ALASSO, ", title_lst[i])) abline(a = 0, b = 1, col = "red", lwd = 2) plot(testsim_lst[[i]]$true_condmean, testsim_lst[[i]]$calibrate_preds, ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2), xlab = "True Conditional Mean", ylab = "Kernel prediction", main = paste0("Kernel, ", title_lst[i])) abline(a = 0, b = 1, col = "red", lwd = 2) }
error_simulator <- function(n) { rnorm(n, sd = 0.3) } n <- 500 system.time({ set.seed(1262) sim_res_cv <- link_viol_sim(1000, betas, x_simulator2, n, error_simulator = error_simulator, testsize = 10000, cv = TRUE) }) system.time({ set.seed(1262) sim_res_nocv <- link_viol_sim(1000, betas, x_simulator2, n, error_simulator = error_simulator, testsize = 10000, cv = FALSE) })
Boxes in gray with CV, boxes in white without CV. Can see errors are the same.
boxplot(cbind(sim_res_cv$errors, sim_res_nocv$errors), col = c(rep("grey", 3), rep("white", 3)), xlab = "Prediction Error", ylab = "MSE", main = "Errors with/without CV") print("Median errors with CV:") apply(sim_res_cv$errors, 2, median) print("Median errors without CV:") apply(sim_res_nocv$errors, 2, median)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.