#!/usr/bin/env Rscript
## SKG
## Aug. 26, 2020
## Bootstrap simulations to estimate standard error for the data
## 5 different models
## Updated June 22, 2021
## Load libraries and data
if("InfectionTrees" %in% rownames(installed.packages())){
remove.packages("InfectionTrees")
}
devtools::install_github("skgallagher/InfectionTrees")
library(InfectionTrees)
library(tidyverse)
data(tb_clean)
for(use_mot in c(FALSE, TRUE)){
print("")
mod <- ifelse(use_mot,
"Multiple Outside Transmissions",
"Base model")
print(mod)
## simulation parameters
K <- 10000 # of MC trees to draw for each cluster
n_models <- 1
n_boot <- 100 # number of bootstrap simulations
my_seed <- 8262020 + ifelse(use_mot, 1000, 0)
## Each inner loop takes ~1 hr
## Format original clusters to useful variables
orig_clusters <- tb_clean %>%
dplyr::mutate(smear = ifelse(spsmear == "Positive",
1, 0),
cluster_id = group,
hiv_f = ifelse(hivstatus == "Negative", "neg",
ifelse(hivstatus == "Positive", "pos",
"unk"))) %>%
dplyr::mutate(hiv_neg_pos = ifelse(hiv_f == "neg", 1, 0),
hiv_unk_pos = ifelse(hiv_f == "unk", 1, 0)) %>%
dplyr::group_by(cluster_id) %>%
dplyr::mutate(rel_time = as.numeric(rel_time / 365)) %>%
dplyr::mutate(cluster_size = dplyr::n()) %>%
dplyr::ungroup() %>%
mutate(race_f = fct_collapse(race,
white = "White",
black = "Black or African American",
asian = "Asian")) %>%
mutate(race_asian_white = ifelse(race_f == "asian", 1, 0),
race_black_white = ifelse(race_f == "black", 1, 0)) %>%
select(cluster_id, smear,
hiv_neg_pos,
hiv_unk_pos,
rel_time,
race_asian_white,
race_black_white,
cluster_size)
## models
covariate_names <- c("smear",
"hiv_neg_pos", "hiv_unk_pos",
"rel_time")
beta_mat <- matrix(0, ncol = length(covariate_names) + 1,
nrow = n_boot)
colnames(beta_mat) <- c("Intercept", covariate_names)
t_init <- proc.time()[3]
for(nn in 1:n_boot){
set.seed(my_seed + nn)
print(sprintf("Bootstrap simulation %d", nn))
## Sample a bootstrap cluster
b_clusters <- bootstrap_clusters(clusters = orig_clusters)
print(sprintf("sampling %d MC trees", K))
## sample MC trees
t0 <- proc.time()[3]
## Sample all MC clusters at once for each of the 5 models
mc_trees <- sample_mc_trees(b_clusters,
B = K,
multiple_outside_transmissions = use_mot,
covariate_names = covariate_names)
print(proc.time()[3] - t0)
print("Model:")
print(covariate_names)
if(is.na(covariate_names[1])){
init_params <- 0
} else{
init_params <- rep(0, length(covariate_names) + 1)
}
## Optimize
print("Optimizing")
bds <- rep(-5, length(init_params))
if(length(covariate_names) > 5){
bds <- rep(-4, length(init_params))
}
lower_bds <- bds
upper_bds <- -bds
cov_mat <- covariate_df_to_mat(mc_trees,
cov_names = covariate_names)
## Now with data.table
t1 <- proc.time()[3]
best_params <- optim(par = init_params,
fn = general_loglike,
mc_trees = data.table::as.data.table(mc_trees),
return_neg = TRUE,
cov_mat = cov_mat,
cov_names = covariate_names,
use_outsider_prob = FALSE,
multiple_outside_transmissions = use_mot,
method = "L-BFGS-B",
lower = lower_bds,
upper = upper_bds
)
t2 <- proc.time()[3] - t1
print(paste("Optimization time:", round( t2 / 60, 3),
"min"))
print("best params:")
print(best_params$par)
beta_mat[nn,] <- best_params$par
print(paste("Total time:", round( (proc.time()[3] - t_init) / 3600, 3),
"hrs"))
}
se_vec <- apply(beta_mat, 2, sd)
print("Bootstrap sample error matrices")
print(se_vec)
output_list <- list(covariate_names = covariate_names,
se_vec = se_vec,
beta_mat = beta_mat,
n_boot = n_boot,
mot = use_mot)
current_time <- Sys.time()
current_time <- gsub(" ", "_", current_time)
current_time <- gsub(":", ".", current_time)
fn_base <- paste0("bootstrap_sims_",
ifelse(use_mot, "mot_", "base_"),
current_time, ".RDS")
saveRDS(output_list, fn_base)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.