revised_paper_results/table1-binary-cov-base-model-sims/run-base-sims-boot.R

#!/usr/bin/env Rscript


## June 18, 2021
## SKG
## Rerun the simulations
## This is for the base model with a binary covariate
## We are using alpha values of .5, .7, .1
## beta0 values of -2.5, -1.5, and -2.75
## and beta1 values of -.5, 0, -.25, 2.25
## There are 15 total sets (so not all combinations are used)
## This version includes a bootstrap resample woo

rep_from_lib <- TRUE


if(rep_from_lib){
    library(InfectionTrees)
} else {
    devtools::load_all()
}
library(dplyr)

set.seed(6172021)

## Simulations to try
M <- 100 # number of simulations for each parameter configuration
B <- 5000 # number of MC samples
K <- 1000 # number of clusters in a data set
n_boot <- 30


par_df1 <- data.frame(
                     M = M,
                     B = B,
                     beta_0 = c(-2.5, -2.5, -2.5,
                         -1.5, -1.5, -1.5),
                     beta_1 = c(.5, 0, -.5,
                                1, 0, -1),
                     gamma = .5)

par_df2 <- data.frame(
                     M = M,
                     B = B,
                     beta_0 = c(-2.5, -2.5, -2.5,
                         -1.5, -1.5, -1.5),
                     beta_1 = c(.5, 0, -.5,
                                .5, 0, -.5),
                     gamma = .7)

par_df3 <-  data.frame(
                     M = M,
                     B = B,
                     beta_0 = c(-2.75, -2.75, -2.75),
                     beta_1 = c(2.25, 0, -.25),
                     gamma = .1)
###########################################################################

## data frame of simulation parameter configurations
par_df <- dplyr::bind_rows(par_df1, par_df2, par_df3)

out_list <- vector(mode = "list", length = nrow(par_df))

#########################################################################

## Generate a small set of pre-sampled trees to get started
binary_cluster_summaries <- data.frame(cluster_size = c(1, 1, 2, 2, 2),
                                       x_pos = c(0, 1, 0, 1, 2),
                                       x_neg = c(1, 0, 2, 1, 0),
                                       freq = 1)
mc_trees <- sample_mc_binary_cov(B = B,
                                 observed_cluster_summaries =
                                     binary_cluster_summaries) %>%
    dplyr::select(-freq)



## Run the simulations
total_time <- proc.time()[3]
for(zz in 1:nrow(par_df)){
  t_params <- proc.time()[3]
  beta_0 <- par_df$beta_0[zz]
  beta_1 <- par_df$beta_1[zz]
  gamma <- par_df$gamma[zz]
  cat(sprintf("Parameter set %d \n", zz))
  cat(sprintf("beta_0 = %.2f, beta_1 = %.2f, gamma = %.2f\n",
                beta_0, beta_1, gamma))


  ## For a given set of parameters
  best_params_mat <- matrix(0, nrow = M, ncol = 2)
  coverage_mat <- matrix(0, nrow = M, ncol = 4)
  colnames(coverage_mat) <- c("boot_beta0", "boot_beta1",
                              "fi_beta0", "fi_beta1")
  cluster_sizes <- numeric(K * M) # store all the cluster sizes from a simulated set of
  ## observed data
  max_cluster_sizes <- matrix(M)
  for(mm in 1:M){
    t_sims <- proc.time()[3]
    cat( sprintf("Simulation %d \n", mm))
    ## Simulate a branching process data set
    ## Distribution of xs
    sample_covariates_df <- data.frame(x = c(rep(0, gamma * 10),
                                             rep(1, (1-gamma) * 10)))


    data <- simulate_bp(K = K ,
                                      inf_params = c(beta_0, beta_1),
                                      sample_covariates_df = sample_covariates_df,
                                      covariate_names = "x",
                                      max_size = 60)
    ## Summarize the data set
    binary_cluster_summaries <- summarize_binary_clusters(data)
    cluster_sizes[(mm-1) *K +  (1:K)] <- rep(binary_cluster_summaries$cluster_size,
                                         times = binary_cluster_summaries$freq)
    
    max_cluster_sizes[mm] <- max(binary_cluster_summaries$cluster_size)
    cat(sprintf("max cluster size is %d \n", max_cluster_sizes[mm]))


    ## ##################################3
    ## SAMPLE (ADDITIONAL) MC TREES
    ## #######################################
    size_npos <- with(binary_cluster_summaries,
                      paste0(cluster_size, "-", x_pos))
    sampled_size_npos <- with(mc_trees,
                              paste0(cluster_size, "-", x_pos))
    missing_sample_ids <- which(!(size_npos %in% sampled_size_npos))
    if(length(missing_sample_ids) > 0){
        new_cluster_summaries <- binary_cluster_summaries[missing_sample_ids,]
        print("Sampling the following trees")
        print(size_npos[missing_sample_ids])
        new_mc_trees <- sample_mc_binary_cov(B = B,
                                             observed_cluster_summaries =
                                                 new_cluster_summaries) %>%
            dplyr::select(-freq)
        mc_trees <- bind_rows(mc_trees, new_mc_trees) %>%
            arrange(cluster_size, x_pos)
    }
    #######################################
    ## OPTIMIZE
    ######################################
    inf_params_0 <- c(0,0)

    best_params <- optim(inf_params_0, fn = bp_loglike_binary_cov,
                         obs_data_summary = binary_cluster_summaries,
                        mc_samples_summary = mc_trees,
                                    return_neg = TRUE,
                       lower = c(-5, -5),
                       upper = c(0, 5),
                       method = "L-BFGS-B",
                       hessian = TRUE)
    best_params_mat[mm,] <- best_params$par
    ## Fisher information (approximately)
    se <- sqrt(diag(solve(best_params$hessian)))
    lower <- best_params$par - 1.96 * se
    upper <- best_params$par + 1.96 * se
    coverage_mat[mm, 3:4] <- ifelse(lower  < c(beta_0, beta_1) &
                                 upper  > c(beta_0, beta_1),
                                 1, 0)
    ## #####################
    ## BOOSTRAP TO GET A SE
    ## #####################
    
    boot_mat <- matrix(0, nrow = n_boot, ncol = 2)
    for(bb in 1:n_boot){
        ## Resample data
        resampled_cluster_inds <- sample(1:nrow(binary_cluster_summaries),
                                         size = sum(binary_cluster_summaries$freq),
                                         prob = binary_cluster_summaries$freq /
                                             sum(binary_cluster_summaries$freq),
                                         replace = TRUE)
        resampled_clusters <- binary_cluster_summaries[resampled_cluster_inds,] %>%
            group_by(cluster_size, x_pos, x_neg) %>%
            summarize(freq = n(), .groups = "drop")

        ## Get estimate of beta
        best_params_boot <- optim(inf_params_0, fn = bp_loglike_binary_cov,
                             obs_data_summary = resampled_clusters,
                             mc_samples_summary = mc_trees,
                             return_neg = TRUE,
                             lower = c(-5, -5),
                             upper = c(0, 5),
                             method = "L-BFGS-B")
        
        boot_mat[bb,] <- best_params_boot$par

    }
    se_boot <- apply(boot_mat, 2, sd)
    lower_boot <- best_params$par - 1.96 * se_boot
    upper_boot <- best_params$par + 1.96 * se_boot
    coverage_mat[mm, 1:2] <- ifelse(lower_boot  < c(beta_0, beta_1) &
                                 upper_boot  > c(beta_0, beta_1),
                                 1, 0)
    ## #####################


    
    

    print("best params")
    print(best_params$par)

    ## Likelihood profiles
    ## lp_ests <- binary_likelihood_profs(best_pars = best_params$par,
    ##                                    max_loglike = -best_params$value,
    ##                                    obs_data_summary =
    ##                                        binary_cluster_summaries,
    ##                                    mc_samples_summary = mc_trees,
    ##                                    alpha = .05)

#    print("95% CI from LP")
  #  print(lp_ests)
    print("95% CI from FI")
    print(cbind(lower, upper))

    ## coverage_mat[mm, 1:2] <- ifelse(lp_ests[,2] < c(beta_0, beta_1) &
    ##                              lp_ests[,3] > c(beta_0, beta_1),
    ##                              1, 0)


    print("Data simulation time (hrs)")
    print((proc.time()[3] - t_sims) / 3600, digits = 2)

    print("Cumulative time (hrs)")
    print((proc.time()[3] - total_time) / 3600, digits = 2)

  }

  print("Params simulation time (hrs)")
  print((proc.time()[3] - t_params) / 3600, digits = 2)


  cat(sprintf("beta_0 = %.2f, beta_1 = %.2f, gamma = %.2f\n ",
          beta_0, beta_1, gamma))
  print("Mean estimates")
  print(colMeans(best_params_mat))
   print("Median estimates")
 print( apply(best_params_mat, 2, median))
  print("2.5Q")
print(  apply(best_params_mat, 2, quantile, probs = .025))
   print("2.5Q")
  print(  apply(best_params_mat, 2, quantile, probs = .975))

  print("Coverage beta0, beta1")
  print(colMeans(coverage_mat))

  print("Cluster sizes:")
  cluster_size_summary <- quantile(cluster_sizes, probs = c(.5, .9, 1))
  print(cluster_size_summary)

  sims_list <- list(best_params_mat = best_params_mat,
                      set = zz,
                    max_cluster_sizes = max_cluster_sizes,
                    coverage = colMeans(coverage_mat),
                    cluster_size_summary = cluster_size_summary,
                    n_boot = n_boot)

  out_list[[zz]] <- sims_list

}


current_time <- Sys.time()
current_time <- gsub(" ", "_", current_time)
current_time <- gsub(":", ".", current_time)
fn <- paste0("simulation_results_boot_", current_time, ".RDS")


saveRDS(out_list, fn)
skgallagher/InfectionTrees documentation built on July 28, 2021, 2:14 p.m.