sandbox/cent_newest.R

#! /usr/bin/env Rscript

# get environment variables
MYSCRATCH <- Sys.getenv('MYSCRATCH')
RESULTDIR <- Sys.getenv('RESULTDIR')
STEPSIZE <- as.numeric(Sys.getenv('STEPSIZE'))
TASKID <- as.numeric(Sys.getenv('SLURM_ARRAY_TASK_ID'))

# set defaults if nothing comes from environment variables
MYSCRATCH[is.na(MYSCRATCH)] <- '.'
RESULTDIR[is.na(RESULTDIR)] <- '.'
STEPSIZE[is.na(STEPSIZE)] <- 1
TASKID[is.na(TASKID)] <- 0

# get command lines arguments
args <- commandArgs(trailingOnly = TRUE)
if(length(args) < 1){
  stop("Not enough arguments. Please use args 'listsize', 'prepare', 'run <itemsize>' or 'merge'")
}

ns <- c(50, 100, 250, 500)
bigB <- 500
K <- c(5, 10, 20, 40)
wrappers <- c("glm_wrapper", "randomforest_wrapper")
# wrappers <- c("glmnet_wrapper")
p <- 10
# TO DO:
# Add a replicate argument for repeated cross-validation estimators
parm <- expand.grid(seed = bigB + 1:bigB,
                    n = ns, K = K, 
                    wrapper = wrappers,
                    stringsAsFactors = FALSE)
load("~/cvtmleauc/scratch/redo_parm_newest.RData")
parm <- redo_parm
# load('~/cvtmleauc/out/allOut_new.RData')
# redo_idx <- which(is.na(out$est_dcvtmle))
# parm <- parm[redo_idx,]
# parm <- parm[1,,drop=FALSE]
# source in simulation Functions
source("~/cvtmleauc/makeData.R")
# load drinf
# library(glmnet)
# devtools::install_github("benkeser/cvtmleAUC", dependencies = TRUE)
library(cvtmleAUC, lib.loc = "/home/dbenkese/R/x86_64-pc-linux-gnu-library/3.4")

 # library(np, lib.loc = "/home/dbenkese/R/x86_64-pc-linux-gnu-library/3.4")
# library(cvAUC)
# library(SuperLearner)
# library(data.table)
# library(glmnet)

# get the list size #########
if (args[1] == 'listsize') {
  cat(nrow(parm))
}

# execute prepare job ##################
if (args[1] == 'prepare') {
  parm_red <- parm[parm$K == parm$K[1] & parm$wrapper == parm$wrapper[1],]
  for(i in 1:nrow(parm_red)){
     set.seed(parm_red$seed[i])
     dat <- makeData(n = parm_red$n[i], p = p)
     save(dat, file=paste0("~/cvtmleauc/scratch/dataList",
                           "_n=",parm_red$n[i],
                           "_seed=",parm_red$seed[i],".RData"))
   }
   print(paste0('initial datasets saved to: ~/cvtmleauc/scratch/dataList ... .RData'))
}

# execute parallel job #################################################
if (args[1] == 'run') {
  if (length(args) < 2) {
    stop("Not enough arguments. 'run' needs a second argument 'id'")
  }
  id <- as.numeric(args[2])
  print(paste(Sys.time(), "arrid:" , id, "TASKID:",
              TASKID, "STEPSIZE:", STEPSIZE))
  for (i in (id+TASKID):(id+TASKID+STEPSIZE-1)) {
    print(paste(Sys.time(), "i:" , i))
    print(parm[i,])
    print(sessionInfo())
    # load data
    load(paste0("~/cvtmleauc/scratch/dataList_",
                "n=",parm$n[i],
                "_seed=",parm$seed[i], 
                ".RData"))
    
    # get estimates of dcvauc
    options(np.messages = FALSE)
    n_replicates <- 20
    fit_dcv <- vector(mode = "list", length = n_replicates)
    fit_cv <- vector(mode = "list", length = n_replicates)
    for(j in seq_len(n_replicates)){
      set.seed(j)
      fit_dcv[[j]] <- cvtn_cvtmle(Y = dat$Y, X = dat$X, K = parm$K[i], 
                          learner = parm$wrapper[i], nested_cv = TRUE,
                          nested_K = 39)
      set.seed(j)
    # get estimates of cvtn
      fit_cv[[j]] <- cvtn_cvtmle(Y = dat$Y, X = dat$X, K = parm$K[i], 
                          learner = parm$wrapper[i], nested_cv = FALSE,
                          prediction_list = fit_dcv$prediction_list[1:parm$K[i]])
    }

    # get the truth
    set.seed(parm$seed[i])
    big_n <- 1e5
    big_data <- makeData(n = big_n, p = 10)
    
    # fit on full data
    fit_full <- do.call(parm$wrapper[i], args = list(train = list(X = dat$X, Y = dat$Y), 
                            test = list(X = big_data$X, Y = big_data$Y)))
    bigquantile_full <- quantile(fit_full$psi_nBn_testx[big_data$Y == 1], p = 0.05, type = 8)
    big_testneg_full <- mean(fit_full$psi_nBn_testx <= bigquantile_full)
    true_parameter <- big_testneg_full

    # bootstrap estimate 
    # only needed for K = 5 runs
    # and will be put back in later
    if(parm$K[i] == 5){
      set.seed(parm$seed[i])
      fit_boot <- cvtmleAUC:::boot_corrected_cvtn(Y = dat$Y, X = dat$X, learner = parm$wrapper[i])
    }else{
      fit_boot <- list(NA)
    }
    # c together output
    out <- c( # cvtmle estimates of dcvauc
             fit_dcv[[1]]$est_cvtmle, fit_dcv[[1]]$se_cvtmle,
             # iterations of cvtmle for dcv
             # fit_dcv[[1]]$iter, 
             # initial plug-in estimate of dcv
             fit_dcv[[1]]$est_init, 
             # one-step estimate of dcv
             fit_dcv[[1]]$est_onestep, fit_dcv[[1]]$se_onestep,
             # estimating eqn estimate of dcv
             fit_dcv[[1]]$est_esteq, fit_dcv[[1]]$se_esteq,
             # cvtmle estimate of cv
             fit_cv[[1]]$est_cvtmle, fit_cv[[1]]$se_cvtmle, 
             # iterations of cvtmle for cv
             # fit_cv[[1]]$iter, 
             fit_cv[[1]]$est_init, 
             # one-step estimate of cv
             fit_cv[[1]]$est_onestep, fit_cv[[1]]$se_onestep,
             # estimating eqn estimate of cv
             fit_cv[[1]]$est_esteq, fit_cv[[1]]$se_esteq,
             # full sample split estimate of cv
             fit_dcv[[1]]$est_empirical, fit_dcv[[1]]$se_empirical)

    # now add in MC averaged results for M = 5, 10, 20
    for(M in c(5, 10, 20)){
      avg_dcv <- .getMCAveragedResults(fit_dcv[1:M], logit = FALSE)
      avg_cv <- .getMCAveragedResults(fit_cv[1:M], logit = FALSE)
      out <- c(out, 
               avg_dcv$est_cvtmle, avg_dcv$se_cvtmle,
               # initial plug-in estimate of dcv
               avg_dcv$est_init, 
             # one-step estimate of dcv
             avg_dcv$est_onestep, avg_dcv$se_onestep,
             # estimating eqn estimate of dcv
             avg_dcv$est_esteq, avg_dcv$se_esteq,
             # cvtmle estimate of cv
             avg_cv$est_cvtmle, avg_cv$se_cvtmle, 
             # iterations of cvtmle for cv
             avg_cv$est_init, 
             # one-step estimate of cv
             avg_cv$est_onestep, avg_cv$se_onestep,
             # estimating eqn estimate of cv
             avg_cv$est_esteq, avg_cv$se_esteq,
             # full sample split estimate of cv
             avg_dcv$est_empirical, avg_dcv$se_empirical)
    }
    out <- c(out, fit_boot[[1]], true_parameter)

    # save output 
    save(out, file = paste0("~/cvtmleauc/out/outtn_",
                            "n=", parm$n[i],
                            "_seed=",parm$seed[i],
                            "_K=",parm$K[i],
                            "_wrapper=",parm$wrapper[i],
                            ".RData.tmp"))
    file.rename(paste0("~/cvtmleauc/out/outtn_",
                            "n=", parm$n[i],
                            "_seed=",parm$seed[i],
                            "_K=",parm$K[i],
                            "_wrapper=",parm$wrapper[i],                            
                            ".RData.tmp"),
                paste0("~/cvtmleauc/out/outtn_",
                            "n=", parm$n[i],
                            "_seed=",parm$seed[i],
                            "_K=",parm$K[i],
                            "_wrapper=",parm$wrapper[i],
                            ".RData"))

    grbg <- c(NULL)
    save(grbg, file = paste0("~/cvtmleauc/out/",i,"-run.dat"))  
  }
}

# merge job ###########################
if (args[1] == 'merge') {   
  ns <- c(50, 100, 250, 500)
  bigB <- 1000
  K <- c(5,10,20,40)
  wrappers <- c("glm_wrapper", "randomforest_wrapper")
  # wrappers <- c("glmnet_wrapper")
  p <- 10
  redo_parm <- NULL
  # TO DO:
  # Add a replicate argument for repeated cross-validation estimators
  parm <- expand.grid(seed = 1:bigB,
                      n = ns, K = K, 
                      wrapper = wrappers,
                      stringsAsFactors = FALSE)
  rslt <- matrix(NA, nrow = nrow(parm), ncol = 66 + 4)
  for(i in 1:nrow(parm)){
    if(file.exists(paste0("~/cvtmleauc/out/outtn_",
                            "n=", parm$n[i],
                            "_seed=",parm$seed[i],
                            "_K=",parm$K[i],
                            "_wrapper=",parm$wrapper[i],
                            ".RData"))){
      tmp_1 <- get(load(paste0("~/cvtmleauc/out/outtn_",
                            "n=", parm$n[i],
                            "_seed=",parm$seed[i],
                            "_K=",parm$K[i],
                            "_wrapper=",parm$wrapper[i],
                            ".RData")))
    }else{
      redo_parm <- rbind(redo_parm, parm[i,])
      tmp_1 <- rep(NA, 66)
    }
    rslt[i,] <- c(parm$seed[i], parm$n[i], parm$K[i], parm$wrapper[i], tmp_1)
    if(is.na(tmp_1[length(tmp_1) - 1])){
      if(file.exists(paste0("~/cvtmleauc/out/outtn_",
                            "n=", parm$n[i],
                            "_seed=",parm$seed[i],
                            "_K=5",
                            "_wrapper=",parm$wrapper[i],
                            ".RData"))){
      boot_rslt_out <- get(load(paste0("~/cvtmleauc/out/outtn_",
                            "n=", parm$n[i],
                            "_seed=",parm$seed[i],
                            "_K=5",
                            "_wrapper=",parm$wrapper[i],
                            ".RData")))
      boot_idx <- length(boot_rslt_out) - 1
      rslt[i,boot_idx] <- boot_rslt_out[boot_idx]
      }
    }
  }
  # # format
  out <- data.frame(rslt, stringsAsFactors = FALSE)

  repeat_names <- c("est_dcvtmle", "se_dcvtmle", 
                 "est_dinit", "est_donestep", "se_donestep",
                 "est_desteq","se_desteq","est_cvtmle","se_cvtmle",
                 "est_init", "est_onestep", "se_onestep",
                 "est_esteq","se_esteq","est_emp","se_emp")
  repeat_names2 <- c("est_dcvtmle", "se_dcvtmle","est_dinit",
                  "est_donestep", "se_donestep",
                 "est_desteq","se_desteq","est_cvtmle","se_cvtmle",
                 "est_init", "est_onestep", "se_onestep",
                 "est_esteq","se_esteq","est_emp","se_emp")

  sim_names <- c("seed","n","K","wrapper",
                 paste0(repeat_names, 1),
                 paste0(repeat_names2, 5),
                 paste0(repeat_names2, 10),
                 paste0(repeat_names2, 20),
                 "est_bootstrap",
                 "truth")  
  colnames(out) <- sim_names
  out[,c(1:3,5:ncol(out))] <- apply(out[,c(1:3,5:ncol(out))], 2, function(y){
    as.numeric(as.character(y))})

  save(out, file=paste0('~/cvtmleauc/out/allOut_cvtn.RData'))
  save(redo_parm, file = "~/cvtmleauc/scratch/redo_parm_newest.RData")
}


# local editing 
if(FALSE){
  setwd("~/Dropbox/R/cvtmleauc/sandbox/simulation/")
  sim <- "cvtn"
  load(paste0("allOut_",sim,".RData"))
    # load("~/cvtmleauc/out/allOut_new.RData")

    library(RColorBrewer)
    my_col <- brewer.pal(9,"Greys")[c(1,2,3,4,5,6)]

    get_sim_rslt <- function(out, parm, wrapper, truth = "truth",
                             estimators = c("dcvtmle1","donestep1",
                                     "cvtmle1","onestep1","emp1",
                                     "dcvtmle5","donestep5",
                                     "cvtmle5","onestep5","emp5",
                                     "dcvtmle10","donestep10",
                                     "cvtmle10","onestep10","emp10",
                                     "dcvtmle20","donestep20",
                                     "cvtmle20","onestep20","emp20",
                                     "bootstrap"
                                     ), ...){
      b <- bp <- v <- m <- cv <- co <- mad <- NULL
      for(i in seq_len(length(parm[,1]))){
        x <- out[out$n == parm$n[i] & out$K == parm$K[i] & out$wrapper == wrapper,]
        b <- rbind(b, colMeans(x[,paste0("est_",estimators)] - x[,truth], na.rm = TRUE))
        bp <- rbind(bp, colMeans((x[,paste0("est_",estimators)] - x[,truth])/x[,truth], na.rm = TRUE))
        v <- rbind(v, apply(x[,paste0("est_",estimators)], 2, var, na.rm = TRUE))
        cv <- rbind(cv, apply(x[,paste0("est_",estimators)], 2, function(y){
          sd(y, na.rm = TRUE) / mean(y, na.rm = TRUE)
        }))
        m <- rbind(m, colMeans((x[,paste0("est_",estimators)] - as.numeric(x[,truth]))^2, na.rm = TRUE))
        mad <- rbind(mad, apply(x[,paste0("est_",estimators)], 2, function(y){ 
          median(y - as.numeric(x[,truth]), na.rm = TRUE)
        }))
        # coverage
        # coverage <- rep(NA, length(estimators))
        # ct <- 0
        # for(est in estimators){
        #   ct <- ct + 1
        #   coverage[ct] <- mean(x[,paste0("est_",est)] - 1.96 * x[,paste0("se_",est)] < x[,truth] & 
        #                   x[,paste0("est_",est)] + 1.96 * x[,paste0("se_",est)] > x[,truth], na.rm = TRUE)
        # }
        # co <- rbind(co, coverage)
      }
      parm <- cbind(parm, b, v, m, mad, cv, bp) #, 
                    # co)
      colnames(parm) <- c("n", "K", paste0("bias_", estimators),
                          paste0("var_", estimators),
                          paste0("mse_", estimators),
                          paste0("mad_", estimators),
                          paste0("cv_", estimators),
                          paste0("bp_", estimators)
                          ) #,
                          # paste0("cov_", estimators))
      return(parm)
    }
    parm <- expand.grid(n = c(50, 100, 250, 500),
                        K = c(5, 10, 20, 40))
    glm_rslt <- get_sim_rslt(out, parm, wrapper = "glm_wrapper")
    randomforest_rslt <- get_sim_rslt(out, parm, wrapper = "randomforest_wrapper")
    
    #---------------------------------
    # bar plots
    #---------------------------------

    make_one_box_plot_row <- function(rslt, metric = "bias", 
                                      est, add_legend = FALSE, 
                                      est_labels = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                      rm_last = TRUE,
                                      leg_x = "topright", leg_y = NULL, 
                                      grid = FALSE, scale = 1, 
                                      add_text = FALSE, 
                                      nx_grid = NULL, ny_grid = BULL, 
                                      pred_algo = "Logistic regression",
                                            yaxis_label, add_ratio = TRUE, 
                                            xaxis_label = "Number CV Folds",
                                            transpose = FALSE, 
                                            log = "y", print_n = FALSE,
                                            relative_est = NULL,
                                            relative_K = NULL, 
                                            absolute_val = TRUE, col, ...){
      for(n in c(50, 100, 250, 500)){
        par(las = 1)
        tmp <- rslt[rslt$n == n, ]
        tmp5 <- tmp[tmp$K == 5,]
        tmp10 <- tmp[tmp$K == 10,]
        tmp20 <- tmp[tmp$K == 20,]
        tmp40 <- tmp[tmp$K == 40,]
        # est <- c("mse_dcvtmle1","mse_donestep1","mse_emp1","mse_bootstrap")
        grbg <- as.matrix(rbind(tmp5[,est],tmp10[,est],tmp20[,est],tmp40[,est]))
        if(absolute_val){
          grbg <- abs(grbg)
        }
        if(rm_last){
          grbg[2:4,4] <- NA
        }
        row.names(grbg) <- c(5,10,20,40)
        if(n == 50){
          leg.text <- est_labels
          sp <- c(0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0)
        }else{
          leg.text <- FALSE
          sp <- c(0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0)
        }
        xl <- c(0, 14 + sum(sp))
        if(transpose){
          grbg <- t(grbg)
        }
        if(!is.null(relative_est)){
          grbg <- grbg / grbg[paste(relative_K),relative_est]
        }
        grbg <- grbg * scale
        txt <- c(5,10,20,40)
        if(n == 50) txt <- c(5,10,20,"",5,10,20,40,5,10,20,40)
      
        tmp <- barplot(grbg, legend.text = FALSE, 
          beside=TRUE, log = log, yaxt = "n", names.arg = rep("",4), col = col, 
          space = sp, xlim = xl, ... )

        if(grid){
          grid(nx = nx_grid, ny = ny_grid, lty = 1, col = "gray75", equilogs = FALSE)
          par(new = TRUE)
          tmp <- barplot(grbg, legend.text = FALSE, 
          beside=TRUE, log = log, yaxt = "n", names.arg = rep("",4), col = col, 
          space = sp, xlim = xl, ... )
        }
        mtext(side = 1, outer = FALSE, line = 0.02, text = txt, cex =0.5, 
        at = c(tmp[,1],tmp[,2],tmp[,3]))
      
        mtext(side = 1, outer = FALSE, line = 0.02, text = "K = ", cex =0.5, 
              at = par()$usr[1])
        if(add_text){
          # grbg2 <- format(grbg, digits = 2, zero.print = TRUE, scientific = FALSE)
          grbg2 <- formatC(grbg, digits = 2, format = "f")
          grbg2[grbg > 10] <- ">10"
          grbg2[grepl("N",grbg2)] <- ""          
          text(x = tmp, y = 2.2*10^par()$usr[3], srt = 90, grbg2)
               # adj = c(0,0.001),
        }
        if(n == 50 & add_legend){
          legend(x = leg_x, y= leg_y, xpd = TRUE, fill = unique(col), legend = leg.text, ncol = 2)
        }
      
        if(n == 50){
          axis(side = 2)
          par(las = 0)
          mtext(outer = FALSE, side = 2, line = 3, yaxis_label, cex = 0.75)
          # mtext(outer = FALSE, side = 2, line = 4.5, pred_algo, cex = 1)
        }else{
          par(las = 0)
          axis(side = 2, labels = FALSE)
        }
        if(print_n){
          mtext(outer = FALSE, side = 3, line = 1.5, paste0("n = ", n))
        }
        if(add_ratio){
        }
        if(!is.null(relative_est)){
          abline(h = 1, lty = 3)
        }
      }
    }


    pdf("~/Dropbox/Emory/cross-validated-prediction-metrics/glm_cvtn.pdf",
        height = 6, width = 11)
    layout(matrix(1:12, nrow = 3, ncol = 4,  byrow = TRUE))
    par(mar = c(1.6, 0.6, 1.6, 0.6), mgp = c(2.1, 0.5, 0),
        oma = c(2.1, 5.1, 2.1, 2.1))
    make_one_box_plot_row(rslt = glm_rslt, print_n = TRUE, 
                          est = c("bp_emp1","bp_dcvtmle1",
                                        "bp_donestep1","bp_bootstrap"),
                          grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                          add_legend = TRUE, scale = 100, 
                          leg_x = 6, leg_y = 1000,
                                est_label = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                ylim = c(1e-2,500), yaxis_label = "Absolute Bias (%)",
                                col = my_col[sort(rep(c(1:3,5),4))])
    make_one_box_plot_row(rslt = glm_rslt, 
                                grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                              est = c("cv_emp1","cv_dcvtmle1",
                                            "cv_donestep1","cv_bootstrap"),
                                    est_label = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                    ylim = c(0.05,1), yaxis_label = "Coefficient of variation",
                                    col = my_col[sort(rep(c(1:3,5),4))])
    make_one_box_plot_row(rslt = glm_rslt, 
                              est = c("mse_emp1","mse_dcvtmle1",
                                            "mse_donestep1","mse_bootstrap"),
                                relative_est = "mse_emp1",
                                relative_K = 5,    add_text = TRUE,
                                grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                                    est_label = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                    ylim = c(0.01,5), yaxis_label = "Relative MSE",
                                    col = my_col[sort(rep(c(1:3,5),4))])
    dev.off()

    pdf("~/Dropbox/Emory/cross-validated-prediction-metrics/rf_cvtn.pdf",
        height = 6, width = 11)
    layout(matrix(1:12, nrow = 3, ncol = 4,  byrow = TRUE))
    par(mar = c(1.6, 0.6, 1.6, 0.6), mgp = c(2.1, 0.5, 0),
        oma = c(2.1, 5.1, 2.1, 2.1))
    make_one_box_plot_row(rslt = randomforest_rslt, print_n = TRUE, 
                          est = c("bp_emp1","bp_dcvtmle1",
                                        "bp_donestep1","bp_bootstrap"),
                          grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                          add_legend = TRUE,
                                est_label = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                ylim = c(0.000001,10), yaxis_label = "Absolute Bias",
                                col = my_col[sort(rep(c(1:3,5),4))])
    make_one_box_plot_row(rslt = randomforest_rslt, 
                                grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                              est = c("cv_emp1","cv_dcvtmle1",
                                            "cv_donestep1","cv_bootstrap"),
                                    est_label = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                    ylim = c(0.000001,10), yaxis_label = "Coefficient of variation",
                                    col = my_col[sort(rep(c(1:3,5),4))])
    make_one_box_plot_row(rslt = randomforest_rslt, 
                              est = c("mse_emp1","mse_dcvtmle1",
                                            "mse_donestep1","mse_bootstrap"),
                                relative_est = "mse_emp1",
                                relative_K = 5,add_text = TRUE,
                                grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                                    est_label = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                    ylim = c(0.01,5), yaxis_label = "Relative MSE",
                                    col = my_col[sort(rep(c(1:3,5),4))])
    dev.off()




#---------------------------
    # AUC PLOTS
    setwd("~/Dropbox/R/cvtmleauc/sandbox/simulation")
    load("allOut_cvauc.RData")

    get_sim_rslt_auc <- function(out, parm, wrapper, truth = "truth",
                             estimators = c("dcvtmle1","donestep1","desteq1",
                                     "cvtmle1","onestep1","emp1",
                                     "dcvtmle5","donestep5",
                                     "cvtmle5","onestep5","emp5",
                                     "dcvtmle10","donestep10",
                                     "cvtmle10","onestep10","emp10",
                                     "dcvtmle20","donestep20",
                                     "cvtmle20","onestep20","emp20",
                                     "bootstrap", "lpo"
                                     ), ...){
       b <- bp <- v <- m <- cv <- co <- mad <- NULL
      for(i in seq_len(length(parm[,1]))){
        x <- out[out$n == parm$n[i] & out$K == parm$K[i] & out$wrapper == wrapper,]
        b <- rbind(b, colMeans(x[,paste0("est_",estimators)] - x[,truth], na.rm = TRUE))
        bp <- rbind(bp, colMeans((x[,paste0("est_",estimators)] - x[,truth])/x[,truth], na.rm = TRUE))
        v <- rbind(v, apply(x[,paste0("est_",estimators)], 2, var, na.rm = TRUE))
        cv <- rbind(cv, apply(x[,paste0("est_",estimators)], 2, function(y){
          sd(y, na.rm = TRUE) / mean(y, na.rm = TRUE)
        }))
        m <- rbind(m, colMeans((x[,paste0("est_",estimators)] - as.numeric(x[,truth]))^2, na.rm = TRUE))
        mad <- rbind(mad, apply(x[,paste0("est_",estimators)], 2, function(y){ 
          median(y - as.numeric(x[,truth]), na.rm = TRUE)
        }))
        # coverage
        # coverage <- rep(NA, length(estimators))
        # ct <- 0
        # for(est in estimators){
        #   ct <- ct + 1
        #   coverage[ct] <- mean(x[,paste0("est_",est)] - 1.96 * x[,paste0("se_",est)] < x[,truth] & 
        #                   x[,paste0("est_",est)] + 1.96 * x[,paste0("se_",est)] > x[,truth], na.rm = TRUE)
        # }
        # co <- rbind(co, coverage)
      }
      parm <- cbind(parm, b, v, m, mad, cv, bp) #, 
                    # co)
      colnames(parm) <- c("n", "K", paste0("bias_", estimators),
                          paste0("var_", estimators),
                          paste0("mse_", estimators),
                          paste0("mad_", estimators),
                          paste0("cv_", estimators),
                          paste0("bp_", estimators)
                          ) #,
      return(parm)
    }
    parm <- expand.grid(n = c(50, 100, 250, 500),
                        K = c(5, 10, 20, 40))
    glm_rslt_auc <- get_sim_rslt_auc(out, parm, wrapper = "glm_wrapper")
    randomforest_rslt_auc <- get_sim_rslt_auc(out, parm, wrapper = "randomforest_wrapper")
    

    make_one_box_plot_row_auc <- function(rslt, metric = "bias", 
                                  est, add_legend = FALSE, 
                                  est_labels = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                  rm_last = TRUE,
                                  grid = FALSE, 
                                  scale = 1, 
                                  leg_y = NULL, 
                                  leg_x = "topleft",
                                  add_text = FALSE, 
                                  nx_grid = NULL, ny_grid = BULL, 
                                  pred_algo = "Logistic regression",
                                        yaxis_label, add_ratio = TRUE, 
                                        xaxis_label = "Number CV Folds",
                                        transpose = FALSE, 
                                        log = "y", print_n = FALSE,
                                        relative_est = NULL,
                                        relative_K = NULL, 
                                        absolute_val = TRUE, col, ...){
      for(n in c(50, 100, 250, 500)){
          par(las = 0)
        tmp <- rslt[rslt$n == n, ]
        tmp5 <- tmp[tmp$K == 5,]
        tmp10 <- tmp[tmp$K == 10,]
        tmp20 <- tmp[tmp$K == 20,]
        tmp40 <- tmp[tmp$K == 40,]
        # est <- c("mse_dcvtmle1","mse_donestep1","mse_emp1","mse_bootstrap")
        grbg <- as.matrix(rbind(tmp5[,est],tmp10[,est],tmp20[,est],tmp40[,est]))
        if(absolute_val){
          grbg <- abs(grbg)
        }
        if(rm_last){
          grbg[2:4,5] <- NA
          grbg[2:4,6] <- NA
        }
        row.names(grbg) <- c(5,10,20,40)

        if(!is.null(relative_est)){
          grbg <- grbg / grbg[paste(relative_K),relative_est]
        }
        if(n == 50){
          leg.text <- est_labels
          sp <- c(0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,-2,0,0,0)
        }else{
          leg.text <- FALSE
          sp <- c(0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,-2,0,0,0)
        }
        txt <- c(5,10,20,40)
        if(n == 50) txt <- c(5,10,20,"",5,10,20,40,5,10,20,40,5,10,20,40)
        grbg <- grbg * scale
        xl <- c(0, 21 + sum(sp))
        tmp <- barplot(grbg, legend.text = FALSE,
                       beside=TRUE, log = "y", yaxt = "n", 
                       names.arg = rep("",dim(grbg)[2]), 
                       col = col, space = sp, xlim = xl, ... )
        mtext(side = 1, outer = FALSE, line = 0.02, text = txt, cex =0.5, 
              at = c(tmp[,1],tmp[,2],tmp[,3],tmp[,4]))
        mtext(side = 1, outer = FALSE, line = 0.02, text = "K = ", cex =0.5, 
              at = par()$usr[1])
        if(grid){
          grid(nx = nx_grid, ny = ny_grid, lty = 1, col = "gray75", equilogs = FALSE)
          par(new = TRUE)
          tmp <- barplot(grbg, legend.text = FALSE, space = sp, xlim = xl, 
          beside=TRUE, log = log, yaxt = "n", names.arg = rep("",dim(grbg)[2]), col = col, 
          ... )
        }
        if(add_text){
          # grbg2 <- format(grbg, digits = 2, zero.print = TRUE, scientific = FALSE)
          grbg2 <- formatC(grbg, digits = 2, format = "f")
          grbg2[grbg > 10] <- ">10"
          grbg2[grepl("N",grbg2)] <- ""          
          text(x = tmp, y = 1.15*10^par()$usr[3], srt = 90, grbg2)
               # adj = c(0,0.001),
        }
        if(add_legend){
          if(n == 50){
            legend(x = leg_x, y = leg_y, xpd = TRUE, fill = unique(col), legend = leg.text, ncol = 2)
          }
        }
        if(n == 50){
          par(las = 2)
          axis(side = 2)
          par(las = 0)
          mtext(outer = FALSE, side = 2, line = 3, yaxis_label, cex = 0.75)
          # mtext(outer = FALSE, side = 2, line = 4, "Logistic Regression", cex = 1)
        }else{
          par(las = 2)
          axis(side = 2, labels = FALSE)
        }
        if(print_n){
          par(las = 0)
          mtext(outer = FALSE, side = 3, line = 1.5, paste0("n = ", n))
        }        
        if(!is.null(relative_est)){
          abline(h = 1, lty = 3)
        }
      }
    }

    pdf("~/Dropbox/Emory/cross-validated-prediction-metrics/glm_cvauc.pdf",
        height = 6, width = 12)
    layout(matrix(1:12, nrow = 3, ncol = 4,  byrow = TRUE))
    par(mar = c(1.6, 0.6, 1.6, 0.6), mgp = c(2.1, 0.5, 0),
        oma = c(2.1, 5.1, 2.1, 2.1))
    make_one_box_plot_row_auc(rslt = glm_rslt_auc, print_n = TRUE, 
                          est = c("bp_emp1","bp_dcvtmle1",
                                        "bp_donestep1","bp_desteq1","bp_bootstrap","bp_lpo"),
                          grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                          add_legend = TRUE, scale = 100, leg_x = 0, leg_y = 150, 
                                est_label = c("CVEMP", "CVTMLE","CVOS","CVEE","BOOT","LPO"),
                                ylim = c(1e-2,50), yaxis_label = "Absolute Bias (%)",
                                col = my_col[sort(rep(1:6,4))])
    make_one_box_plot_row_auc(rslt = glm_rslt_auc, 
                                grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                              est = c("cv_emp1","cv_dcvtmle1",
                                            "cv_donestep1","cv_desteq1","cv_bootstrap","cv_lpo"),
                                    est_label = c("CVEMP", "CVTMLE","CVOS","CVEE","BOOT","LPO"),
                                    ylim = c(1e-2,0.2), yaxis_label = "Coefficient of variation",
                                    col = my_col[sort(rep(1:6,4))])
    make_one_box_plot_row_auc(rslt = glm_rslt_auc, 
                              est = c("mse_emp1","mse_dcvtmle1",
                                            "mse_donestep1","mse_desteq1","mse_bootstrap", "mse_lpo"),
                                relative_est = "mse_emp1",
                                relative_K = 5,add_text = TRUE,
                                grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                                    est_label = c("CVEMP", "CVTMLE","CVOS","CVEE","BOOT","LPO"),
                                    ylim = c(0.5,1.5), yaxis_label = "Relative MSE",
                                    col = my_col[sort(rep(1:6,4))])
      dev.off()


    pdf("~/Dropbox/Emory/cross-validated-prediction-metrics/rf_cvauc.pdf",
        height = 6, width = 12)
    layout(matrix(1:12, nrow = 3, ncol = 4,  byrow = TRUE))
    par(mar = c(1.6, 0.6, 1.6, 0.6), mgp = c(2.1, 0.5, 0),
        oma = c(2.1, 5.1, 2.1, 2.1))
    make_one_box_plot_row_auc(rslt = randomforest_rslt_auc, print_n = TRUE, 
                              est = c("bp_emp1","bp_dcvtmle1",
                                        "bp_donestep1","bp_desteq1","bp_bootstrap","bp_lpo"),
                          grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                          add_legend = TRUE, scale = 100,  leg_x = 0, leg_y = 150, 
                                est_label = c("CVEMP", "CVTMLE","CVOS","CVEE","BOOT","LPO"),
                                ylim = c(1e-2,50), yaxis_label = "Absolute Bias (%)",
                                col = my_col[sort(rep(1:6,4))])
    make_one_box_plot_row_auc(rslt = randomforest_rslt_auc, 
                                grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                              est = c("cv_emp1","cv_dcvtmle1",
                                        "cv_donestep1","cv_desteq1","cv_bootstrap","cv_lpo"),
                                    est_label = c("CVEMP", "CVTMLE","CVOS","CVEE","BOOT","LPO"),
                                    ylim = c(1e-3,0.2), yaxis_label = "Coefficient of variation",
                                    col = my_col[sort(rep(1:6,4))])
    make_one_box_plot_row_auc(rslt = randomforest_rslt_auc, 
                              est = c("mse_emp1","mse_dcvtmle1",
                                        "mse_donestep1","mse_desteq1","mse_bootstrap","mse_lpo"),
                                relative_est = "mse_emp1",
                                relative_K = 5,add_text = TRUE,
                                grid = TRUE, ny_grid = NULL, nx_grid = 0,                           
                                    est_label = c("CVEMP", "CVTMLE","CVOS","CVEE","BOOT","LPO"),
                                    ylim = c(0.5,1.5), yaxis_label = "Relative MSE",
                                    col = my_col[sort(rep(1:6,4))])
    dev.off()


    #---------------------------------
    # bar plots
    #---------------------------------

    make_side_by_side_bar_plots <- function(glm_rslt, randomforest_rslt,
                                            est, est_labels, rm_last = TRUE,
                                            yaxis_label, add_ratio = FALSE, 
                                            relative_est = NULL, relative_K = NULL, 
                                            xaxis_label = "Number CV Folds",
                                            absolute_val = TRUE, col, ...){
      layout(matrix(1:8, nrow = 2, ncol = 4,  byrow = TRUE))
      par(mar = c(1.6, 0.6, 1.6, 0.6), mgp = c(2.1, 0.5, 0),
          oma = c(2.1, 5.1, 2.1, 2.1))
      for(n in c(50, 100, 250, 500)){
          par(las = 1)
        tmp <- glm_rslt[glm_rslt$n == n, ]
        tmp5 <- tmp[tmp$K == 5,]
        tmp10 <- tmp[tmp$K == 10,]
        tmp20 <- tmp[tmp$K == 20,]
        tmp40 <- tmp[tmp$K == 40,]
        # est <- c("mse_dcvtmle1","mse_donestep1","mse_emp1","mse_bootstrap")
        grbg <- as.matrix(rbind(tmp5[,est],tmp10[,est],tmp20[,est],tmp40[,est]))
        if(absolute_val){
          grbg <- abs(grbg)
        }
        if(rm_last){
          grbg[2:4,4] <- NA
        }
        row.names(grbg) <- c(5,10,20,40)

        if(!is.null(relative_est)){
          grbg <- grbg / grbg[paste(relative_K),relative_est]
        }
        if(n == 50){
          leg.text <- est_labels
        }else{
          leg.text <- FALSE
        }
        tmp <- barplot(grbg, legend.text = FALSE,
          beside=TRUE, log = "y", yaxt = "n", names.arg = rep("",dim(grbg)[2]), col = col, ... )
          mtext(side = 1, outer = FALSE, line = 0.02, text = c(5,10,20,40), cex =0.5, 
        at = c(tmp[,1],tmp[,2],tmp[,3]))
                  mtext(side = 1, outer = FALSE, line = 0.02, text = "K = ", cex =0.5, 
              at = par()$usr[1])
        if(n == 50){
          legend(x = "topright", fill = unique(col), legend = leg.text)
        }
        if(n == 50){
          par(las = 0)
          axis(side = 2)
          mtext(outer = FALSE, side = 2, line = 2, yaxis_label, cex = 0.75)
          mtext(outer = FALSE, side = 2, line = 4, "Logistic Regression", cex = 1)
        }else{
          par(las = 0)
          axis(side = 2, labels = FALSE)
        }
        mtext(outer = FALSE, side = 3, line = 0.5, paste0("n = ", n))
        if(!is.null(relative_est)){
          abline(h = 1, lty = 3)
        }
      }
      for(n in c(50, 100, 250, 500)){
        par(las = 1)
        tmp <- randomforest_rslt[randomforest_rslt$n == n, ]
        tmp5 <- tmp[tmp$K == 5,]
        tmp10 <- tmp[tmp$K == 10,]
        tmp20 <- tmp[tmp$K == 20,]
        tmp40 <- tmp[tmp$K == 40,]
        # est <- c("mse_dcvtmle1","mse_donestep1","mse_emp1","mse_bootstrap")
        grbg <- as.matrix(rbind(tmp5[,est],tmp10[,est],tmp20[,est],tmp40[,est]))
        if(absolute_val){
          grbg <- abs(grbg)
        }
        if(rm_last){
          grbg[2:4,4] <- NA
        }
            row.names(grbg) <- c(5,10,20,40)
        if(!is.null(relative_est)){
          grbg <- grbg / grbg[paste(relative_K),relative_est]
        }
        tmp <- barplot(grbg,  beside=TRUE, log = "y", yaxt = "n", names.arg = rep("",dim(grbg)[2]), 
                       col = col, ...)
        mtext(side = 1, outer = FALSE, line = 0.02, text = c(5,10,20,40), cex =0.5, 
              at = c(tmp[,1],tmp[,2],tmp[,3]))
        mtext(side = 1, outer = FALSE, line = 0.02, text = "K = ", cex =0.5, 
              at = par()$usr[1])
        # mtext(side = 1, outer = FALSE, line = 0.02, text = c(5,10,20,40), cex =0.5, 
        #       at = c(mean(tmp[2:3,1]),mean(tmp[2:3,2]),mean(tmp[2:3,3])))
        par(las = 0)
        if(n == 50){
          axis(side = 2)
          mtext(outer = FALSE, side = 2, line = 2, yaxis_label, cex = 0.75)
          mtext(outer = FALSE, side = 2, line = 4, "Random Forest", cex = 1)
        }else{
          axis(side = 2, labels = FALSE)
        }
        if(!is.null(relative_est)){
          abline(h = 1, lty = 3)
        }
      }
    }






    #----------
    # not relative plots
    # bias
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("bias_emp1","bias_dcvtmle1",
                                        "bias_donestep1","bias_bootstrap"),
                                est_labels = c("CVEMP", "CVTMLE","CVOS","BOOT"),
                                ylim = c(0.000001,10), yaxis_label = "Absolute Bias",
                                col = my_col[sort(rep(1:4,4))])

    # variance
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("var_emp1","var_dcvtmle1",
                                        "var_donestep1","var_bootstrap"),
                                est_labels = c("CVTMLE", "CVOS","Empirical","Bootstrap"),
                                ylim = c(0.00001,10), yaxis_label = "Variance",
                                col = my_col[sort(rep(1:4,4))])
    # relative mse
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("mse_emp1","mse_dcvtmle1",
                                        "mse_donestep1",
                                        "mse_bootstrap"),
                                relative_est = "mse_emp1",
                                relative_K = 5,
                                est_label = c("CV-EMP", "CV-TMLE","CV-OS","BOOT"),
                                ylim = c(0.01,10), yaxis_label = "Relative MSE",
                                col = my_col[sort(rep(1:4,4))])  





    # variance
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("var_emp1","var_dcvtmle1",
                                        "var_donestep1","var_bootstrap"),
                                relative_est = "var_emp1",
                                relative_K = 5,
                                est_label = c("CV-EMP", "CV-TMLE","CV-OS","BOOT"),
                                ylim = c(0.1,10), yaxis_label = "Relative variance",
                                col = my_col[sort(rep(1:4,4))])


    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("mse_emp1","mse_dcvtmle1",
                                        "mse_donestep1",
                                        "mse_bootstrap"),
                                relative_est = "mse_emp1",
                                relative_K = 5,
                                est_label = c("CV-EMP", "CV-TMLE","CV-OS","BOOT"),
                                ylim = c(0.01,10), yaxis_label = "Relative MSE",
                                col = my_col[sort(rep(1:4,4))])   
    # mad
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("mad_emp1","mad_dcvtmle1",
                                        "mad_donestep1",
                                        "mad_bootstrap"),
                                relative_est = "mad_emp1",
                                relative_K = 5,
                                est_label = c("CV-EMP", "CV-TMLE","CV-OS","BOOT"),
                                ylim = c(0.01,100), yaxis_label = "Relative MAD",
                                col = my_col[sort(rep(1:4,4))])    
    # cv
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("cv_emp1","cv_dcvtmle1",
                                        "cv_donestep1",
                                        "cv_bootstrap"),
                                relative_est = "cv_emp1",
                                relative_K = 5,
                                est_label = c("CV-EMP", "CV-TMLE","CV-OS","BOOT"),
                                ylim = c(0.01,100), yaxis_label = "Relative COEF VAR",
                                col = my_col[sort(rep(1:4,4))])



    # bias by CV Repeats for CVTMLE
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("bias_dcvtmle1","bias_dcvtmle5",
                                        "bias_dcvtmle10","bias_dcvtmle20"),
                                rm_last = FALSE, transpose = TRUE, 
                                est_label = paste0("CVTMLE ", c(1,5,10,20)," repeats"),
                                ylim = c(0.00001,100), yaxis_label = "Absolute bias",,
                                col = my_col[sort(rep(1:4,4))])

    # variance by CV Repeats for CVTMLE
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("var_dcvtmle1","var_dcvtmle5",
                                        "var_dcvtmle10","var_dcvtmle20"),
                                rm_last = FALSE, transpose = TRUE, 
                                est_label = paste0("CVTMLE ", c(1,5,10,20)," repeats"),
                                ylim = c(0.0001,0.01), yaxis_label = "Variance",
                                col = my_col[sort(rep(1:4,4))])

    # mse by CV Repeats for CVTMLE
    make_side_by_side_bar_plots(glm_rslt, randomforest_rslt, 
                                est = c("mse_dcvtmle1","mse_dcvtmle5",
                                        "mse_dcvtmle10","mse_dcvtmle20"),
                                rm_last = FALSE, transpose = TRUE, 
                                est_label = paste0("CVTMLE ", c(1,5,10,20)," repeats"),
                                ylim = c(0.0001,0.1), yaxis_label = "MSE",
                                col = my_col[sort(rep(1:4,4))])





    # compare mse of cvtmle with 40 folds to mse of empirical with 5 folds
    make_mse_compare_one_repeat <- function(rslt, B, legend = FALSE){
      cvt_n_glm <- rslt[,paste0("mse_dcvtmle",B)][rslt$K == 40][2:4]
      cvo_n_glm <- rslt[,paste0("mse_donestep",B)][rslt$K == 40][2:4]
      emp_n_glm <- rslt[,paste0("mse_emp",B)][rslt$K == 5][2:4]

      plot(y = cvt_n_glm/emp_n_glm, x = 1:3, xaxt = "n", yaxt = "n", bty = "n",
           xlim = c(1,3), ylim = c(0,2), type = "b",
           xlab = "Sample size", ylab = "MSE / Empirical with K = 5")
      axis(side = 1, at = 1:3, labels = c(100, 250, 500))
      axis(side = 2)
      abline(h = 1, lty = 3)
      points(y = cvo_n_glm/emp_n_glm, x = 1:3, pch = 2, type = "b", lty = 2)
      if(legend){
        legend(x = "topleft", c("CVTMLE, K = 40", "CVOS, K = 40"), pch = 1:2,
               bty = "n")
      }
    }

    layout(matrix(1:8, nrow = 2, ncol = 4,  byrow = TRUE))
    for(b in c(1, 5, 10, 20)){
      make_mse_compare_one_repeat(rslt = glm_rslt, b, legend = ifelse(b == 1, TRUE, FALSE))
      mtext(side = 3, text = paste0("MC Repeats = ", b))
    }
    for(b in c(1, 5, 10, 20)){
      make_mse_compare_one_repeat(rslt = randomforest_rslt, b, legend = ifelse(b == 1, TRUE, FALSE))
    }




    cvt_n_glm <- glm_rslt$mse_dcvtmle1[glm_rslt$K == 40]
    cvt_n_glm <- glm_rslt$mse_dcvtmle1[glm_rslt$K == 40]

    ratio_cvtmle_to_mse <- 


    #---------------------------------
    # Bias plots
    #---------------------------------
    # top row = glm bias ~ K for each n 
    # bottom row = random forest bias ~ K for each n

    layout(matrix(1:8, nrow = 2, ncol = 4,  byrow = TRUE))
    par(mar = c(1.6, 0.6, 0.6, 0.6), mgp = c(2.1, 0.5, 0),
        oma = c(2.1, 5.1, 2.1, 2.1))
    for(n in c(50, 100, 250, 500)){
      tmp <- glm_rslt[glm_rslt$n == n, ]
      tmp5 <- tmp[tmp$K == 5,]
      tmp10 <- tmp[tmp$K == 10,]
      tmp20 <- tmp[tmp$K == 20,]
      tmp40 <- tmp[tmp$K == 40,]
      est <- c("bias_dcvtmle1","bias_donestep1","bias_emp1")
      grbg <- t(abs(as.matrix(rbind(tmp5[,est],tmp10[,est],tmp20[,est],tmp40[,est]))))
      colnames(grbg) <- c(5,10,20,40)
      barplot(grbg,
        beside=TRUE, log = "y", yaxt = "n", ylim = c(0.0001, 200))
      if(n == 50){
        axis(side = 2)
        mtext(outer = FALSE, side = 2, line = 2, "Absolute bias", cex = 0.75)
        mtext(outer = FALSE, side = 2, line = 4, "Logistic Regression", cex = 1)
      }else{
        axis(side = 2, labels = FALSE)
      }
      mtext(outer = FALSE, side = 3, line = 0.5, paste0("n = ", n))
    }
    for(n in c(50, 100, 250, 500)){
      tmp <- randomforest_rslt[randomforest_rslt$n == n, ]
      tmp5 <- tmp[tmp$K == 5,]
      tmp10 <- tmp[tmp$K == 10,]
      tmp20 <- tmp[tmp$K == 20,]
      tmp40 <- tmp[tmp$K == 40,]
      est <- c("bias_dcvtmle1","bias_donestep1","bias_emp1")
      grbg <- t(abs(as.matrix(rbind(tmp5[,est],tmp10[,est],tmp20[,est],tmp40[,est]))))
      colnames(grbg) <- c(5,10,20,40)
      barplot(grbg,
        beside=TRUE, log = "y", yaxt = "n", ylim = c(0.0001, 200))
      mtext(side = 1, outer = FALSE, line = 2, "Number of CV Folds", cex =0.75)
      if(n == 50){
        axis(side = 2)
        mtext(outer = FALSE, side = 2, line = 2, "Absolute bias", cex = 0.75)
        mtext(outer = FALSE, side = 2, line = 4, "Random Forest", cex = 1)
      }else{
        axis(side = 2, labels = FALSE)
      }
    }


    #--------------------------------
    # MSE plots
    #--------------------------------
    make_mse_compare_plot <- function(rslt, est1, est2, ns = c(100, 250, 500, 750),
                                      Ks = c(5, 10, 20, 40),...){
    # make matrix of relative MSE
    n_ct <- 0
    K_ct <- 0
    rel_mse <- matrix(NA, length(ns), length(Ks))
    for(n in ns){
      n_ct <- n_ct + 1
      for(K in Ks){
        K_ct <- K_ct + 1
        rel_mse[n_ct, K_ct] <- rslt[rslt$n == n & rslt$K == K, paste0("mse_",est1)] / 
                                  rslt[rslt$n == n & rslt$K == K, paste0("mse_",est2)]      
      }
      K_ct <- 0
    }
    row.names(rel_mse) <- ns
    colnames(rel_mse) <- Ks
    
    superheat::superheat(X = rel_mse, X.text = round(rel_mse, 2), scale = FALSE, 
              pretty.order.rows = FALSE, 
              pretty.order.cols = FALSE, heat.col.scheme = "red",
              row.title = "Sample size", column.title = "CV folds",
              title = paste0("MSE(",est1,")/MSE(",est2,")"),
    # plot_done()
              legend.breaks = seq(min(rel_mse), max(rel_mse), by = 0.1), ...)
    }

    # CV TMLE vs. empirical

    for(rslt in c("glm_rslt","randomforest_rslt","glmnet_rslt","stepglm_rslt")){
      pdf(paste0("~/cvtmleauc/",rslt,"_perf.pdf"))
      # comparing to emp
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "dcvtmle", est2 = "emp")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "cvtmle", est2 = "emp")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "desteq", est2 = "emp")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "esteq", est2 = "emp")  
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "donestep", est2 = "emp")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "onestep", est2 = "emp")
      # comparing to eachother
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "onestep", est2 = "cvtmle")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "donestep", est2 = "dcvtmle")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "onestep", est2 = "esteq")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "donestep", est2 = "desteq")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "esteq", est2 = "cvtmle")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "desteq", est2 = "dcvtmle")
      # comparing cv to dcv 
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "onestep", est2 = "donestep")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "cvtmle", est2 = "dcvtmle")
        make_mse_compare_plot(eval(parse(text = rslt)), est1 = "esteq", est2 = "desteq")
      dev.off()
    }
  #--------------------------------
  # CV TMLE vs. Empirical 
  #--------------------------------
  pdf("mse_results.pdf")
  superheat(X = rel_mse_cvtmle, X.text = round(rel_mse_cvtmle, 2), scale = FALSE, 
            pretty.order.rows = FALSE, 
            pretty.order.cols = FALSE, heat.col.scheme = "red",
            row.title = "Sample size", column.title = "CV folds",
            legend.breaks = c(0.7, 0.8, 0.9, 1),
            title = "MSE CVTMLE / MSE Empirical")
  #--------------------------------
  # One step vs. Empirical 
  #--------------------------------
  superheat(X = rel_mse_onestep, X.text = round(rel_mse_onestep, 2), scale = FALSE, 
            pretty.order.rows = FALSE, 
            pretty.order.cols = FALSE, heat.col.scheme = "red",
            row.title = "Sample size", column.title = "CV folds",
            legend.breaks = c(0.7, 0.8, 0.9, 1),
            title = "MSE CV One step / MSE Empirical")  
  #--------------------------------
  # CVTMLE vs. Onestep 
  #--------------------------------
  superheat(X = rel_mse_tmlevonestep, X.text = round(rel_mse_tmlevonestep, 2), 
            scale = FALSE, 
            pretty.order.rows = FALSE, 
            pretty.order.cols = FALSE, heat.col.scheme = "red",
            row.title = "Sample size", column.title = "CV folds",
            legend.breaks = c(0.7, 0.8, 0.9, 1),
            title = "MSE CV One step / MSE CVTMLE")
  dev.off()

  #--------------------------------
  # Coverage plots
  #--------------------------------
  # make matrix of relative MSE
  n_ct <- 0
  K_ct <- 0
  cov_cvtmle <- matrix(NA, 4, 4)
  cov_onestep <- matrix(NA, 4, 4)
  cov_tmlevonestep <- matrix(NA, 4, 4)
  for(n in c(100, 250, 500, 750)){
    n_ct <- n_ct + 1
    for(K in c(5, 10, 20, 30)){
      K_ct <- K_ct + 1
      cov_cvtmle[n_ct, K_ct] <- parm$cov_cvtmle[parm$n == n & parm$K == K]     
      cov_onestep[n_ct, K_ct] <- parm$cov_onestep[parm$n == n & parm$K == K] 
      cov_tmlevonestep[n_ct, K_ct] <- parm$cov_empirical[parm$n == n & parm$K == K] 
    }
    K_ct <- 0
  }
  row.names(cov_cvtmle) <- row.names(cov_onestep) <- row.names(cov_tmlevonestep) <- c(100, 250, 500, 750)
  colnames(cov_cvtmle) <- colnames(cov_onestep) <- colnames(cov_tmlevonestep) <- c(5, 10, 20, 30)
  
  #--------------------------------
  # CV TMLE vs. Empirical 
  #--------------------------------
  pdf("coverage_results.pdf")
  superheat(X = cov_cvtmle, X.text = round(cov_cvtmle, 2), scale = FALSE, 
            pretty.order.rows = FALSE, 
            pretty.order.cols = FALSE, heat.col.scheme = "red",
            row.title = "Sample size", column.title = "CV folds",
            legend.breaks = c(0.7, 0.8, 0.9, 1),
            title = "Coverage of nominal 95% CI CVTMLE")
  #--------------------------------
  # One step vs. Empirical 
  #--------------------------------
  superheat(X = cov_onestep, X.text = round(cov_onestep, 2), scale = FALSE, 
            pretty.order.rows = FALSE, 
            pretty.order.cols = FALSE, heat.col.scheme = "red",
            row.title = "Sample size", column.title = "CV folds",
            legend.breaks = c(0.7, 0.8, 0.9, 1),
            title = "Coverage of nominal 95% CI One step")  
  #--------------------------------
  # CVTMLE vs. Onestep 
  #--------------------------------
  superheat(X = cov_tmlevonestep, X.text = round(cov_tmlevonestep, 2), 
            scale = FALSE, 
            pretty.order.rows = FALSE, 
            pretty.order.cols = FALSE, heat.col.scheme = "red",
            row.title = "Sample size", column.title = "CV folds",
            legend.breaks = c(0.7, 0.8, 0.9, 1),
            title = "Coverage of nominal 95% CI Empirical")
  dev.off()

  
    make_side_by_side_bar_plots <- function(glm_rslt, randomforest_rslt,
                                            est, est_labels, rm_last = TRUE,
                                            yaxis_label, add_ratio = TRUE, 
                                            xaxis_label = "Number CV Folds",
                                            transpose = FALSE, 
                                            log = "y",
                                            relative_est = NULL,
                                            relative_K = NULL, 
                                            absolute_val = TRUE, col, ...){
      layout(matrix(1:8, nrow = 2, ncol = 4,  byrow = TRUE))
      par(mar = c(1.6, 0.6, 1.6, 0.6), mgp = c(2.1, 0.5, 0),
          oma = c(2.1, 5.1, 2.1, 2.1))
      for(n in c(50, 100, 250, 500)){
        par(las = 1)
        tmp <- glm_rslt[glm_rslt$n == n, ]
        tmp5 <- tmp[tmp$K == 5,]
        tmp10 <- tmp[tmp$K == 10,]
        tmp20 <- tmp[tmp$K == 20,]
        tmp40 <- tmp[tmp$K == 40,]
        # est <- c("mse_dcvtmle1","mse_donestep1","mse_emp1","mse_bootstrap")
        grbg <- as.matrix(rbind(tmp5[,est],tmp10[,est],tmp20[,est],tmp40[,est]))
        if(absolute_val){
          grbg <- abs(grbg)
        }
        if(rm_last){
          grbg[2:4,4] <- NA
        }
        row.names(grbg) <- c(5,10,20,40)
        if(n == 50){
          leg.text <- est_labels
          sp <- c(0,0,0,0,0,0,0,0,1,0,0,0,1,0)
        }else{
          leg.text <- FALSE
          sp <- c(0,0,0,0,1,0,0,0,1,0,0,0,1,0)
        }
        if(transpose){
          grbg <- t(grbg)
        }
        if(!is.null(relative_est)){
          grbg <- grbg / grbg[paste(relative_K),relative_est]
        }
        
        txt <- c(5,10,20,40)
        if(n == 50) txt <- c(5,10,20,"",5,10,20,40,5,10,20,40)
      
        tmp <- barplot(grbg, legend.text = FALSE, 
          beside=TRUE, log = log, yaxt = "n", names.arg = rep("",4), col = col, 
          space = sp, ... )

        mtext(side = 1, outer = FALSE, line = 0.02, text = txt, cex =0.5, 
        at = c(tmp[,1],tmp[,2],tmp[,3]))
      
        mtext(side = 1, outer = FALSE, line = 0.02, text = "K = ", cex =0.5, 
              at = par()$usr[1])
      
        if(n == 50){
          legend(x = "topright", fill = unique(col), legend = leg.text, ncol = 2)
        }
      
        if(n == 50){
          axis(side = 2)
          par(las = 0)
          mtext(outer = FALSE, side = 2, line = 3, yaxis_label, cex = 0.75)
          mtext(outer = FALSE, side = 2, line = 4.5, "Logistic Regression", cex = 1)
        }else{
          par(las = 0)
          axis(side = 2, labels = FALSE)
        }
        mtext(outer = FALSE, side = 3, line = 1.5, paste0("n = ", n))

        if(add_ratio){
        }
        if(!is.null(relative_est)){
          abline(h = 1, lty = 3)
        }
      }
      for(n in c(50, 100, 250, 500)){
        par(las = 1)
        tmp <- randomforest_rslt[randomforest_rslt$n == n, ]
        tmp5 <- tmp[tmp$K == 5,]
        tmp10 <- tmp[tmp$K == 10,]
        tmp20 <- tmp[tmp$K == 20,]
        tmp40 <- tmp[tmp$K == 40,]
        # est <- c("mse_dcvtmle1","mse_donestep1","mse_emp1","mse_bootstrap")
        grbg <- as.matrix(rbind(tmp5[,est],tmp10[,est],tmp20[,est],tmp40[,est]))
        if(absolute_val){
          grbg <- abs(grbg)
        }
        if(rm_last){
          grbg[2:4,4] <- NA
        }
        row.names(grbg) <- c(5,10,20,40)
        if(transpose){
          grbg <- t(grbg)
        }
        if(!is.null(relative_est)){
          grbg <- grbg / grbg[paste(relative_K),relative_est]
        }
        txt <- c(5,10,20,40)
        if(n == 50){
          txt <- c(5,10,20,"",5,10,20,40,5,10,20,40)
          sp <- c(0,0,0,0,0,0,0,0,1,0,0,0,1,0)
        }else{
          sp <- c(0,0,0,0,1,0,0,0,1,0,0,0,1,0)
        }
        tmp <- barplot(grbg,  beside=TRUE, log = log, yaxt = "n", names.arg = rep("",4), 
                       col = col, space = sp, ...)
        mtext(side = 1, outer = FALSE, line = 0.02, text = txt, cex =0.5, 
              at = c(tmp[,1],tmp[,2],tmp[,3]))        
        mtext(side = 1, outer = FALSE, line = 0.02, text = "K = ", cex =0.5, 
              at = par()$usr[1])
        # mtext(side = 1, outer = FALSE, line = 0.02, text = c(5,10,20,40), cex =0.5, 
        #       at = c(mean(tmp[2:3,1]),mean(tmp[2:3,2]),mean(tmp[2:3,3])))
        if(n == 50){
          axis(side = 2)
          par(las = 0)
          mtext(outer = FALSE, side = 2, line = 3, yaxis_label, cex = 0.75)
          mtext(outer = FALSE, side = 2, line = 4.5, "Random Forest", cex = 1)
        }else{
          par(las = 0)
          axis(side = 2, labels = FALSE)
        }
        if(!is.null(relative_est)){
          abline(h = 1, lty = 3)
        }
      }
    }


}
benkeser/cvtmleAUC documentation built on May 16, 2019, 2:30 a.m.