knitr::opts_chunk$set(fig.width = 8, fig.height = 5,
                      fig.align = "center", message = TRUE,
                      warning = TRUE, echo = TRUE)
library(magrittr)
library(data.table)
library(ggplot2)
library(cowplot)
library(xtable)
library(bst235Project)
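# bst235Project supplies the simulation and estimation routines used below
# (sim_data, adaptive_lasso, fit_true_model, select_bandwidth, np_calibrate,
# and link_viol_sim).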

Methods

Results

# Load the simulation output and reshape to long format: one row per
# (scenario, coefficient) combination.
res_df <- read.csv("../data/simdata.csv") %>%
    tidyr::gather(coef, value, -n, -rho1, -rho12, -rho2, -x_dist, -sd) %>%
    data.table()

# Split prediction-error rows from coefficient-estimate rows.
errors_df <- res_df[startsWith(coef, "error_"), ] %>%
    dplyr::rename(error = coef)
betas_df <- res_df[!startsWith(coef, "error_"), ]

Betas

# For the true-zero coefficients (all betas except beta6-beta9), compute the
# proportion of simulations in which the estimate is numerically zero; for the
# true-nonzero coefficients, the proportion estimated as nonzero.
betas_zeros <- betas_df[!(coef %in% c("alpha", "beta6", "beta7",
                                      "beta8", "beta9")),
                        mean(abs(value) < 1e-8),
                        by = .(coef, n, rho1, rho12, rho2, x_dist, sd)] %>%
    dplyr::rename(prop_zeros = V1) # %>%
    # xtable(digits = 3) %>% print(comment = FALSE)

betas_nonzeros <- betas_df[coef %in% c("beta6", "beta7", "beta8", "beta9"),
                           mean(abs(value) > 1e-8),
                           by = .(coef, n, rho1, rho12, rho2, x_dist, sd)] %>%
    dplyr::rename(prop_nonzeros = V1) # %>%
    # xtable(digits = 3) %>% print(comment = FALSE)

hist(betas_zeros$prop_zeros,
     xlab = "Proportion of true-zero betas estimated as zero",
     main = "Proportion of True-Zero Betas Estimated as Zero")
# With n = 5000 and sd = 0.01, check whether selection is perfect in every
# scenario (all true zeros set to zero, all true nonzeros retained).
all(betas_zeros[n == 5000 & sd == 0.01, prop_zeros] == 1)
all(betas_nonzeros[n == 5000 & sd == 0.01, prop_nonzeros] == 1)
betas_zeros[n == 5000 & sd == 0.3, ] %>%
    ggplot(aes(x = coef, y = prop_zeros, color = x_dist)) +
    geom_boxplot() +
    geom_point(position = position_dodge(width = 0.75),
               aes(group = x_dist), alpha = 0.5) +
    xlab("Coefficient") + ylab("Proportion Zero") +
    scale_color_discrete(name = "Covariate Distribution") +
    ggtitle("Proportion of Zeros Among True Zero Coefficients, n = 5000, sd = 0.3")
hist(betas_nonzeros$prop_nonzeros,
     xlab = "Proportion of true-nonzero betas estimated as nonzero",
     main = "Proportion of True-Nonzero Betas Estimated as Nonzero")

hist(betas_nonzeros[n == 5000 & sd == 0.3, prop_nonzeros], yaxt = 'n',
     main = "Prop. of Nonzeros Among True Nonzero Coefficients, n = 5000, sd = 0.3",
     xlab = "Proportion of Nonzeros")
axis(side = 2, at = seq(0, 50, 2), labels = seq(0, 50, 2))

betas_nonzeros[n == 5000 & sd == 0.3, ]
# zero betas
betas_df[coef != "alpha" & coef != "beta6" & coef != "beta7" & coef != "beta8" & coef != "beta9", ] %>%
    ggplot(aes(x = as.factor(n), y = value,
               color = interaction(rho1, rho12, rho2, sep = ", "))) +
    geom_violin() + facet_grid(x_dist ~ as.factor(sd) + coef) +
    scale_color_discrete(name = "(rho1, rho12, rho2)") +
    xlab("n") + ylab("Coefficient Value") +
    ggtitle("Zero Betas") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
# nonzero betas: scale each estimate by its true value, so ratio = 1 means
# the estimate recovers the truth exactly
betas_df[coef == "beta6", ratio := value / -0.3]
betas_df[coef == "beta7", ratio := value / -0.1]
betas_df[coef == "beta8", ratio := value / 0.1]
betas_df[coef == "beta9", ratio := value / 0.3]
betas_df[coef == "beta6" | coef == "beta7" | coef == "beta8" | coef == "beta9", ] %>%
    ggplot(aes(x = as.factor(n), y = ratio,
               color = interaction(rho1, rho12, rho2, sep = ", "))) +
    geom_boxplot() + facet_grid(x_dist ~ as.factor(sd) + coef) +
    scale_color_discrete(name = "(rho1, rho12, rho2)") +
    xlab("n") + ylab("Coefficient Value / True Value") +
    ggtitle("Nonzero Betas") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    geom_hline(aes(yintercept = 0))
selected_betas <- betas_df[n == 5000 & sd > 0.1 & rho1 == 0.3 & rho2 == 0.3 &
                           coef %in% c("beta6", "beta7", "beta8", "beta9"), ]

selected_betas[, mean(ratio), by = .(x_dist, coef)] %>%
    xtable(digits = 3)

ggplot(selected_betas,
       aes(x = x_dist, y = ratio, color = x_dist)) +
    geom_boxplot() + facet_grid(~coef) +
    xlab("Distribution of X") + ylab("Coefficient Value / True Value") +
    guides(color = FALSE) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle("True Nonzero Betas; n = 5000, sigma = 0.3, rho1 = rho12 = rho2 = 0.3")
# alpha
betas_df[coef == "alpha", ratio := value / 3]
betas_df[coef == "alpha", ] %>%
    ggplot(aes(x = as.factor(n), y = ratio,
               color = interaction(rho1, rho12, rho2, sep = ", "))) +
    geom_boxplot() + facet_grid(as.factor(sd) ~ x_dist) +
    scale_color_discrete(name = "(rho1, rho12, rho2)") +
    xlab("n") + ylab("Coefficient Value / True Value") +
    ggtitle("alpha")

Prediction Errors

p_small <- errors_df[sd == 0.01 & rho1 == 0.3 & rho2 == 0.3, ] %>%
    ggplot(aes(y = value, color = error, x = as.factor(n))) +
    geom_boxplot() + facet_grid(~ x_dist) +
    xlab("n") + ylab("MSE") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle("Error Comparison, Small Errors, rho1 = rho12 = rho2 = 0.3")


p_large <- errors_df[sd > 0.1 & rho1 == 0.3 & rho2 == 0.3, ] %>%
    ggplot(aes(y = value, color = error,
               x = as.factor(n))) +
    geom_boxplot() + facet_grid(~ x_dist) +
    xlab("n") + ylab("MSE") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle("Error Comparison, Big Errors, rho1 = rho12 = rho2 = 0.3")
plot_grid(p_small, p_large, nrow = 2)
errors_df[sd == 0.01,] %>%
ggplot(aes(y = value, color = error,
           x = interaction(rho1, rho12, rho2, sep = ", "))) +
    geom_boxplot() + facet_grid(x_dist ~ n) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("(rho1, rho12, rho2)") + ylab("MSE") +
    ggtitle("Error Comparison, Small Errors")

errors_df[sd == 0.3,] %>%
ggplot(aes(y = value, color = error,
           x = interaction(rho1, rho12, rho2, sep = ", "))) +
    geom_boxplot() + facet_grid(x_dist ~ n) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("(rho1, rho12, rho2)") + ylab("MSE") +
    ggtitle("Error Comparison, Big Errors")

Exploration of Results

# True coefficients: intercept alpha = 3, five true-zero betas, and four
# nonzero betas (-0.3, -0.1, 0.1, 0.3). Covariates are multivariate normal
# with an exchangeable correlation of 0.3.
betas <- c(3, 0, 0, 0, 0, 0, -0.3, -0.1, 0.1, 0.3)
corr_mat <- matrix(0.3, nrow = 9, ncol = 9)
diag(corr_mat) <- 1
n <- 1000

x_simulator2 <- function(n) {
    return(mvtnorm::rmvnorm(n, sigma = corr_mat))
}
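
The outcome model used in this section has conditional mean alpha + pnorm(X %*% beta) (see the true_condmean calculation in test_simulation below), so the linear ALASSO working model misspecifies the link. bst235Project:::sim_data itself is not shown here; assuming it simply adds the simulated error to that conditional mean, a minimal sketch of the data-generating step would look like the following (hypothetical sketch, not the package code).

# Hypothetical sketch of the data-generating step; the package's sim_data may
# differ in details such as names or scaling.
sim_data_sketch <- function(betas, x_simulator, error_simulator, n) {
    xs <- x_simulator(n)                                  # n x 9 covariate matrix
    ys <- betas[1] + pnorm(xs %*% betas[-1]) + error_simulator(n)
    list(xs = xs, ys = as.vector(ys))
}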


# Run one replicate of the link-violation experiment: simulate training and
# test data, fit the adaptive lasso and the true nonlinear model on the
# training set, and return naive, calibrated, and true-model predictions on
# the test set.
test_simulation <- function(sd) {
    error_simulator <- function(n) { rnorm(n, mean = 0, sd = sd) }
    test_dat <- bst235Project:::sim_data(betas, x_simulator2,
                                         error_simulator, n = 10000)
    train_dat <- bst235Project:::sim_data(betas, x_simulator2,
                                          error_simulator, n)
    alasso_cv <- bst235Project:::adaptive_lasso(train_dat$xs,
                                                train_dat$ys)
    # fit true model
    true_mod_betas <- bst235Project:::fit_true_model(train_dat$xs,
                                                     train_dat$ys)

    # get predictions
    true_condmean <- betas[1] + pnorm(test_dat$xs %*% betas[-1])
    true_preds <- true_mod_betas[1] +
        pnorm(test_dat$xs %*% true_mod_betas[-1])
    naive_preds <- predict(alasso_cv, newx = test_dat$xs,
                           s = "lambda.min")

    # do calibration predictions
    train_yhat <- predict(alasso_cv, newx = train_dat$xs,
                          s = "lambda.min")
    best_h <- bst235Project:::select_bandwidth(train_yhat, train_dat$ys,
                                               nfolds = 10, cv = FALSE)
    calibrate_preds <-
        bst235Project:::np_calibrate(test_dat$xs, train_dat$xs,
                                     train_dat$ys, alasso_cv,
                                     bandwidth = best_h)
    ret <- list(test_ys = test_dat$ys,
                true_condmean = true_condmean,
                true_preds = true_preds,
                naive_preds = naive_preds,
                calibrate_preds = calibrate_preds,
                train_yhat = train_yhat,
                train_ys = train_dat$ys)
    return(ret)
}
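
adaptive_lasso is an internal of bst235Project and is not shown here; its return value is used like a cv.glmnet fit (predictions take s = "lambda.min"). For orientation, a common adaptive lasso construction is sketched below, with penalty weights taken from an initial ridge fit. This is an illustrative assumption, not the package's implementation.

# Illustrative sketch of an adaptive lasso via glmnet: penalty weights come
# from an initial cross-validated ridge fit, then a weighted lasso is tuned by
# cross-validation. The package's adaptive_lasso may differ.
adaptive_lasso_sketch <- function(xs, ys) {
    init <- glmnet::cv.glmnet(xs, ys, alpha = 0)              # ridge initial fit
    init_coefs <- drop(as.matrix(coef(init, s = "lambda.min")))
    w <- 1 / abs(init_coefs[-1])                              # drop the intercept
    glmnet::cv.glmnet(xs, ys, alpha = 1, penalty.factor = w)  # weighted lasso
}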

# Plot the distributions of the true test outcomes and the three sets of
# predictions on a common x-axis range.
plot_hists <- function(testsim_obj) {
    par(mfrow = c(2, 2))
    xlims <- range(testsim_obj$test_ys)
    hist(testsim_obj$test_ys, breaks = 50, xlim = xlims,
         main = "True Y", xlab = "True Y")
    hist(testsim_obj$true_preds, breaks = 50, xlim = xlims,
         main = "True Model Predictions",
         xlab = "True Model Predictions")
    hist(testsim_obj$naive_preds, breaks = 50, xlim = xlims,
         main = "ALASSO Predictions", xlab = "ALASSO Predictions")
    hist(testsim_obj$calibrate_preds, breaks = 50, xlim = xlims,
         main = "Calibrated Predictions",
         xlab = "Calibrated Predictions")
}

set.seed(1262)
testsim_smallerr <- test_simulation(sd = 0.01)
testsim_bigerr <- test_simulation(sd = 0.3)

testsim_lst <- list(testsim_smallerr, testsim_bigerr)
title_lst <- c("Small Error", "Big Error")
plot_hists(testsim_smallerr)
plot_hists(testsim_bigerr)
par(mfrow = c(2, 2))
for (i in 1:2) {
    plot(testsim_lst[[i]]$train_yhat, testsim_lst[[i]]$train_ys,
         pch = 20, col = rgb(0, 0, 0, 0.3),
         xlab = "Train Y_hat (ALASSO)", ylab = "Train Y",
         main = paste0("Y vs. Y_hat, ", title_lst[i]))
    abline(a = 0, b = 1, col = "red", lwd = 2)
    plot(testsim_lst[[i]]$train_yhat,
         testsim_lst[[i]]$train_ys - testsim_lst[[i]]$train_yhat,
         pch = 20, col = rgb(0, 0, 0, 0.3),
         xlab = "Train Y_hat (ALASSO)", ylab = "Train Residual",
         main = paste0("Residual, ", title_lst[i]))
}
par(mfcol = c(1, 2))
for (i in 1:2) {
    naive_preds <- testsim_lst[[i]]$naive_preds
    test_ys <- testsim_lst[[i]]$test_ys
    calibrate_preds <- testsim_lst[[i]]$calibrate_preds
    plot(naive_preds, test_ys, pch = 20, col = rgb(0, 0, 0, 0.3),
         xlab = "ALASSO prediction on test", ylab = "True test Y",
         main = paste0("Kernel Predictions, ", title_lst[i]))
    lines(naive_preds[order(naive_preds)],
          calibrate_preds[order(naive_preds)],
          col = "red", lwd = 2)
}
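
The red curve above traces the kernel-calibrated prediction as a function of the naive ALASSO prediction, which is consistent with a one-dimensional kernel regression of the training outcomes on the training ALASSO predictions, evaluated at the test predictions. A minimal Nadaraya-Watson sketch under that assumption follows; np_calibrate and select_bandwidth are package internals, so this is illustrative rather than the actual implementation.

# Illustrative sketch only: a Gaussian-kernel Nadaraya-Watson smoother of the
# training outcomes against the training ALASSO predictions, evaluated at the
# test-set ALASSO predictions. Assumes `fit` is a cv.glmnet-style object.
nw_calibrate_sketch <- function(test_xs, train_xs, train_ys, fit, bandwidth) {
    train_yhat <- as.vector(predict(fit, newx = train_xs, s = "lambda.min"))
    test_yhat <- as.vector(predict(fit, newx = test_xs, s = "lambda.min"))
    sapply(test_yhat, function(y0) {
        w <- dnorm((train_yhat - y0) / bandwidth)
        sum(w * train_ys) / sum(w)
    })
}
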
par(mfrow = c(2, 3))
for (i in 1:2) {
    plt_lims <- range(testsim_lst[[i]]$test_ys)
    plot(testsim_lst[[i]]$test_ys, testsim_lst[[i]]$true_preds,
         ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2),
         xlab = "True test Y", ylab = "True model prediction",
         main = paste0("True Model, ", title_lst[i]))
    abline(a = 0, b = 1, col = "red", lwd = 2)
    plot(testsim_lst[[i]]$test_ys, testsim_lst[[i]]$naive_preds,
         ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2),
         xlab = "True test Y", ylab = "ALASSO prediction",
         main = paste0("ALASSO, ", title_lst[i]))
    abline(a = 0, b = 1, col = "red", lwd = 2)
    plot(testsim_lst[[i]]$test_ys, testsim_lst[[i]]$calibrate_preds,
         ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2),
         xlab = "True test Y", ylab = "Kernel prediction",
         main = paste0("Kernel, ", title_lst[i]))
    abline(a = 0, b = 1, col = "red", lwd = 2)
}
par(mfrow = c(2, 3))
for (i in 1:2) {
    plt_lims <- range(testsim_lst[[i]]$true_condmean)
    plot(testsim_lst[[i]]$true_condmean, testsim_lst[[i]]$true_preds,
         ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2),
         xlab = "True Conditional Mean", ylab = "True model prediction",
         main = paste0("True Model, ", title_lst[i]))
    abline(a = 0, b = 1, col = "red", lwd = 2)
    plot(testsim_lst[[i]]$true_condmean, testsim_lst[[i]]$naive_preds,
         ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2),
         xlab = "True Conditional Mean", ylab = "ALASSO prediction",
         main = paste0("ALASSO, ", title_lst[i]))
    abline(a = 0, b = 1, col = "red", lwd = 2)
    plot(testsim_lst[[i]]$true_condmean,
         testsim_lst[[i]]$calibrate_preds,
         ylim = plt_lims, pch = 20, col = rgb(0, 0, 0, 0.2),
         xlab = "True Conditional Mean", ylab = "Kernel prediction",
         main = paste0("Kernel, ", title_lst[i]))
    abline(a = 0, b = 1, col = "red", lwd = 2)
}

Evaluating Cross-Validation

error_simulator <- function(n) { rnorm(n, sd = 0.3) }
n <- 500

# Compare runtime and prediction error when the calibration bandwidth is
# selected by cross-validation (cv = TRUE) versus without it (cv = FALSE).
system.time({
    set.seed(1262)
    sim_res_cv <- link_viol_sim(1000, betas, x_simulator2, n,
                                error_simulator = error_simulator,
                                testsize = 10000, cv = TRUE)
})
system.time({
    set.seed(1262)
    sim_res_nocv <- link_viol_sim(1000, betas, x_simulator2, n,
                                  error_simulator = error_simulator,
                                  testsize = 10000, cv = FALSE)
})

Gray boxes show the prediction errors when the calibration bandwidth is selected by cross-validation; white boxes show the errors without cross-validation. The error distributions are essentially the same.

boxplot(cbind(sim_res_cv$errors, sim_res_nocv$errors),
        col = c(rep("grey", 3), rep("white", 3)),
        xlab = "Prediction Error", ylab = "MSE",
        main = "Errors with/without CV")
print("Median errors with CV:")
apply(sim_res_cv$errors, 2, median)
print("Median errors without CV:")
apply(sim_res_nocv$errors, 2, median)

