#!/usr/bin/env Rscript
## SKG
## Aug. 20, 2020
## Now with the outsiders
## Simulate outsider outbreaks
## See if we can estimate the parameters
## ## Updated June 21, 2021 forJCGS revisions
## ##
print("WARNING:")
print("With the given initial parameters, this script will take ~two weeks to run")
print("As such they have been changed so this will run fully")
print("To reproduce paper results, switch to initial settings")
## Load libraries and data
## devtools::load_all()
if(!require(InfectionTrees)){
devtools::install_github("skgallagher/InfectionTrees")
}
library(InfectionTrees)
library(tidyverse)
## simulation parameters
K <- 100 # number of clusters to simulate, init is 1000
n_sims <- 2 # total times to simulate per set of parameters
B <- 500 # number of MC draws per cluster, init is 10000
## EACH OUTER LOOP TAKES >30 HOURS.
params_df <- data.frame(beta0 = c(-2.5, -2.5,
-2.75, -2.75,
-2.5, -2.5,
-2.5, -2.5,
-1.5, -1.5,
-1.5, -1.5,
-1.5, -2.75,
-1.5),
beta1 = c(-.5, -.5,
0, -.25,
0, 0,
.5, .5,
-.5, -1,
0, 0,
.5, 2.25,
1),
gamma = c(.5, .7,
.1, .1,
.5, .7,
.5, .7,
.7, .5,
.7, .5,
.7, .1,
.5))
params_df$set <- 1:nrow(params_df)
simulation_output <- vector(mode = "list", length = nrow(params_df))
for(par_index in 1:nrow(params_df)){
my_seed <- 8152020 + par_index
set.seed(my_seed) # needs fixed?
t0 <- proc.time()[3]
gamma <- .7
beta0 <- params_df$beta0[par_index]
beta1 <- params_df$beta1[par_index]
gamma <- params_df$gamma[par_index]
print(sprintf("beta0 = %.2f, beta1 = %.2f, gamma = %.2f",
beta0, beta1, gamma))
inf_params <- c(beta0, beta1)
covariates_df <- data.frame(x = c(rep(1, gamma * 10),
rep(0, (1-gamma) * 10)
))
covariate_names <- "x"
## Simulate an outbreak
best_params_mat <- matrix(0, nrow = n_sims, ncol = length(inf_params))
cluster_sizes_list <- vector(mode = "list", length = n_sims)
t_init <- proc.time()[3]
coverage_mat <- matrix(0, nrow = n_sims, ncol = 2)
n_singular <- 0
for(sim in 1:n_sims){
print("simulation number")
print(sim)
print("simulating data set")
df <- simulate_bp(K = K,
inf_params = inf_params,
sample_covariates_df = covariates_df,
covariate_names = covariate_names,
max_size = 60)
outsider_obs <- df %>%
filter(gen != 1) %>%
mutate(cluster_size = cluster_size - 1)
if(nrow(outsider_obs) == 0){
print("Have not sampled any trees with at least 2 people. Moving to next instance")
next
}
## Number of 'observed' clusters
length(unique(outsider_obs$cluster_id))
cluster_size_df <- outsider_obs %>% dplyr::group_by(cluster_id) %>%
summarize(cluster_size = n(),
.groups = "drop")
cluster_sizes_list[[sim]] <- cluster_size_df$cluster_size
print("sampling MC trees")
## sample MC trees
t0 <- proc.time()[3]
mc_trees <- sample_mc_trees(outsider_obs,
B = B,
multiple_outside_transmissions = TRUE)
print(proc.time()[3] - t0)
## Optimize
print("Optimizing")
init_params <- rep(0, length(covariate_names) + 1)
bds <- rep(-5, length(covariate_names) + 1)
if(length(covariate_names) > 5){
bds <- rep(-4, length(covariate_names) + 1)
}
lower_bds <- bds
upper_bds <- -bds
cov_mat <- covariate_df_to_mat(mc_trees,
cov_names = covariate_names)
## Now with data.table
t1 <- proc.time()[3]
best_params <- optim(par = init_params,
fn = general_loglike,
mc_trees = data.table::as.data.table(mc_trees),
return_neg = TRUE,
cov_mat = cov_mat,
cov_names = covariate_names,
use_outsider_prob = FALSE,
multiple_outside_transmissions = TRUE,
method = "L-BFGS-B",
lower = lower_bds,
upper = upper_bds,
hessian = TRUE
)
#######################################
## Get a 95% CI using the fisher info
## ## New for JCGS
se <- tryCatch({
sqrt(diag(solve(best_params$hessian)))
},
error = function(e){
print("ERROR trying to estimate FI")
return(c(NA, NA))
})
n_singular <- ifelse(any(is.na(se)), n_singular + 1, n_singular)
print("SE")
print(se)
lower <- best_params$par - 1.96 * se
upper <- best_params$par + 1.96 * se
coverage_mat[sim, 1:2] <- ifelse(lower < c(beta0, beta1) &
upper > c(beta0, beta1),
1, 0)
#################################
t2 <- proc.time()[3] - t1
print(paste("Optimization time:", round( t2 / 60, 3),
"min"))
print("best params:")
print(best_params$par)
best_params_mat[sim,] <- best_params$par
print(paste("Total time:", round( (proc.time()[3] - t_init) / 3600, 3),
"hrs"))
}
coverage <- colMeans(coverage_mat)
means <- colMeans(best_params_mat, na.rm = TRUE)
medians <- apply(best_params_mat, 2, median, na.rm = TRUE)
sd <- apply(best_params_mat, 2, sd, na.rm = TRUE)
cluster_sizes_vec <- do.call("c",
cluster_sizes_list)
cluster_med <- median(cluster_sizes_vec, na.rm = TRUE)
cluster_max <- max(cluster_sizes_vec, na.rm = TRUE)
cluster_90 <- quantile(cluster_sizes_vec, probs = .90, na.rm = TRUE)
names(cluster_90) <- NULL
data_out <- list(best_params_mat = best_params_mat,
seed = my_seed,
init_params = inf_params,
gamma = gamma,
set = par_index,
par_set = params_df[par_index,],
cluster_sizes = c("med" = cluster_med,
"max" = cluster_max,
"q90" = cluster_90),
coverage = coverage)
simulation_output[[par_index]] <- data_out
current_time <- Sys.time()
current_time <- gsub(" ", "_", current_time)
current_time <- gsub(":", ".", current_time)
fn_base <- paste0("single_output_outsider_", "set_",
par_index,
current_time, ".RDS")
saveRDS(data_out, fn_base)
}
current_time <- Sys.time()
current_time <- gsub(" ", "_", current_time)
current_time <- gsub(":", ".", current_time)
fn_base <- paste0("all_output_outsider_", current_time, ".RDS")
saveRDS(simulation_output, fn_base)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.