knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
#library(calibR)
This code was created by the DARTH workgroup (www.darthworkgroup.com). When using or modifying this code, please do so with attribution and cite our publications:
Alarid-Escudero F, Maclehose RF, Peralta Y, Kuntz KM, Enns EA. Non-identifiability in model calibration and implications for medical decision making. Med Decis Making. 2018; 38(7):810-821.
Jalal H, Pechlivanoglou P, Krijkamp E, Alarid-Escudero F, Enns E, Hunink MG. An Overview of R in Health Decision Sciences. Med Decis Making. 2017; 37(3): 735-746.
A walkthrough of the code could be found in the follwing link: - https://darth-git.github.io/calibSMDM2018-materials/
Model: 3-State Cancer Relative Survival (CRS) Markov Model
Inputs to be calibrated: p_Mets, p_DieMets
Targets: Surv
Directed search using: - Gradient-based (GB) (uses derivatives) - Nelder-Mead (NM) algorithm, aka simplex method (derivative free) - Global optimization techniques: - Simulated Annealing (SANN) - Genetic algorithms (GA)
# General purposes: pacman::p_load(devtools, testthat, usethis, tidyverse) devtools::load_all() # Calibration: pacman::p_load(lhs) # Sampling pacman::p_load(DEoptim) # Genetic Algorithm pacman::p_load(numDeriv) # Hessian approximation # Visualisation: pacman::p_load(plotrix, psych)
load(file.path(here::here(), "data", "CRS_targets.rda")) lst_targets <- CRS_targets
# Survival ("Surv") targets_pt = ggplot(data = lst_targets$Surv, aes(x = time, y = value)) + geom_errorbar(aes(ymin = lb, ymax = ub)) + geom_point() + theme( panel.border = element_rect(fill = NA, color = 'black') ) + labs(title = "Calibration target", x = "Time", y = "Proportion survived") targets_pt
# Check that the model works: v_params_test <- c(p_Mets = 0.10, p_DieMets = 0.05) # true values # Check that inputs are handled correctly, by using true values: compare(CRS_markov(v_params = v_params_test), CRS_markov()) # Compare outputs of default values to the targets: targets_pt + geom_line(data = as_tibble(CRS_markov()) %>% mutate(time = cbind(lst_targets$Surv$time)), aes(x = time, y = Surv), color = 'red') + scale_color_manual(values = c('Surv' = "red"))
# Specify seed (for reproducible sequence of random numbers): seed_no = 1 set.seed(seed_no) # Number of initial starting points - Nelder-mead: n_init <- 100 # Number of random samples: n_samples <- 1000 # Names and number of input parameters to be calibrated: v_params_names <- c("p_Mets", "p_DieMets") n_params <- length(v_params_names) # Range on input search space: lb <- c(p_Mets = 0.04, p_DieMets = 0.04) # lower bound ub <- c(p_Mets = 0.16, p_DieMets = 0.16) # upper bound # Number of calibration targets: v_target_names <- c("Surv") n_target <- length(v_target_names)
## Goodness-of-fit functions to pass to the algorithms: ## Log likelihood (LLK) goodness-of-fit function: f_gof_llk <- function(.v_params, .lst_targets = lst_targets) { ### Run model for a given parameter set: model_res <- CRS_markov(v_params = .v_params) ### Calculate goodness-of-fit of model outputs to targets: v_GOF <- numeric(n_target) ### TARGET 1: Survival ("Surv") ### log likelihood: v_GOF[1] <- sum(dnorm(x = .lst_targets$Surv$value, mean = model_res$Surv, sd = .lst_targets$Surv$se, log = TRUE)) ### Overall fit: ### Different targets can have different weights: v_weights <- rep(1, n_target) ### Weighted sum: GOF_overall <- sum(v_GOF[1:n_target] * v_weights) ### return GOF return(GOF_overall) } ## Sum of squared errors (SSE) goodness-of-fit function: f_gof_sse <- function(.v_params, .lst_targets = lst_targets) { ### Run model for a given parameter set: model_res <- CRS_markov(v_params = .v_params) ### Calculate goodness-of-fit of model outputs to targets: v_GOF <- numeric(n_target) ### TARGET 1: Survival ("Surv") ### Weighted sum of squared errors: w <- 1/(.lst_targets$Surv$se^2) v_GOF[1] <- -sum(w * (.lst_targets$Surv$value - model_res$Surv)^2) ### Overall fit: ### Different targets can have different weights: v_weights <- rep(1, n_target) ### Weighted sum: GOF_overall <- sum(v_GOF[1:n_target] * v_weights) ### return GOF return(GOF_overall) } ## Uncertainty estimates from the hessian matrix: summ_optim <- function(.name = v_params_names, .best, .par, .func = NULL, .hessian = NULL, .iter = 1) { stopifnot((!is.null(.hessian) | !is.null(.func))) # sigma.mat <- solve(-.hessian) # cor.mat <- cov2cor(sigma.mat) # sd.vec <- sqrt(diag(sigma.mat)) # upper <- .par + 2 * sd.vec # lower <- .par - 2 * sd.vec # fisher_info <- solve(-.hessian) # prop_sigma <- sqrt(diag(fisher_info)) # prop_sigma <- diag(prop_sigma) # upper<- .par + 1.96 * prop_sigma # lower<- .par - 1.96 * prop_sigma # Approximate the hessian matrix if not estimated by the package: if(is.null(.hessian)) { .hessian <- hessian(func = .func, x = .par) } # Estimate Fisher Information Matrix (FIM): fisher_info <- solve(.hessian) # Negative numbers don't have real square roots, correct if diag is < 0: # If optim minimised GOF then we need negative hessian: if(any(diag(fisher_info) < 0)) fisher_info = -fisher_info # Get the standard errors: prop_se <- sqrt(diag(fisher_info)) # Calculate confidence interval: upper <- .par + 1.96 * prop_se lower <- .par - 1.96 * prop_se return(tibble(Params = .name, Best = .best, Estimate = .par, Lower = lower, Upper = upper, Iteration = .iter)) }
## Sample multiple random starting values for some of the algorithms: v_params_init <- matrix(nrow = n_init, ncol = n_params) for (i in 1:n_params){ v_params_init[, i] <- runif(n_init, min=lb[i], max=ub[i]) } colnames(v_params_init) <- v_params_names res_all = list()
res_llk = list() res_sse = list() set.seed(seed_no) ## Record start time of calibration: t_init <- Sys.time() ## Run Gradient-based for each starting point: m_calib_res_llk <- m_calib_res_sse <- matrix(nrow = n_init, ncol = n_params + 1) colnames(m_calib_res_llk) <- colnames(m_calib_res_sse) <- c(v_params_names, "Overall_fit") ## Use Gradient-based as method in the optim() function: for (j in 1:n_init) { fit_gb <- optim(par = v_params_init[j, ], fn = f_gof_llk, # GOF is log likelihood method = "BFGS", control = list(fnscale = -1, # switch to maximisation maxit = 1000), # maximum iterations hessian = TRUE) m_calib_res_llk[j, ] <- c(fit_gb$par, fit_gb$value) res_llk[j] <- list(summ_optim(.best = fit_gb$value, .iter = j, .par = fit_gb$par, .hessian = fit_gb$hessian)) } res_all <- append(res_all, list('GB_llk' = res_llk %>% map_dfr(.f = ~ .x) %>% pivot_wider(id_cols = c(Iteration, Best), names_from = Params, values_from = c(Estimate, Lower, Upper)))) for (j in 1:n_init) { fit_gb <- optim(par = v_params_init[j, ], fn = f_gof_sse, # GOF is sum of squared errors method = "BFGS", control = list(fnscale = -1, # switch to maximisation maxit = 1000), # maximum iterations hessian = TRUE) m_calib_res_sse[j, ] <- c(fit_gb$par, fit_gb$value) res_sse[j] <- list(summ_optim(.best = fit_gb$value, .iter = j, .par = fit_gb$par, .hessian = fit_gb$hessian)) } res_all <- append(res_all, list('GB_sse' = res_sse %>% map_dfr(.f = ~ .x) %>% pivot_wider(id_cols = c(Iteration, Best), names_from = Params, values_from = c(Estimate, Lower, Upper)))) # Calculate computation time: comp_time <- Sys.time() - t_init comp_time
## Sort in descending order for LLK and ascending for SSE: m_calib_res_llk <- m_calib_res_llk %>% as_tibble() %>% arrange(desc(Overall_fit)) m_calib_res_sse <- m_calib_res_sse %>% as_tibble() %>% arrange(desc(Overall_fit)) ## Examine the top 10 best-fitting sets: m_calib_res_llk[1:10,] m_calib_res_sse[1:10,] # Plot the top 100 (top 10%): ggplot() + geom_point(data = m_calib_res_sse[1:100,], aes(x = p_Mets, y = p_DieMets)) + theme( panel.border = element_rect(fill = NA, colour = 'black') ) + labs(title = "SSE parameters") ggplot() + geom_point(data = m_calib_res_llk[1:100,], aes(x = p_Mets, y = p_DieMets)) + theme( panel.border = element_rect(fill = NA, colour = 'black') ) + labs(title = "Likelihood parameters") # Pairwise comparison of top 100 sets: pairs.panels(m_calib_res_llk[1:100, v_params_names]) pairs.panels(m_calib_res_sse[1:100, v_params_names]) # Plot model-predicted output at best set vs targets: v_out_best_llk <- CRS_markov(m_calib_res_llk[1,]) v_out_best_sse <- CRS_markov(m_calib_res_sse[1,]) targets_pt + geom_point(data = tibble('Likelihood' = v_out_best_llk$Surv) %>% mutate('SSE' = v_out_best_sse$Surv, 'time' = cbind(lst_targets$Surv$time)) %>% pivot_longer(cols = -time, names_to = 'GOF', values_to = 'Survival'), aes(x = time, y = Survival, color = GOF, shape = GOF), alpha = 0.5)
res_llk = list() res_sse = list() set.seed(seed_no) ## Record start time of calibration: t_init <- Sys.time() ## Run Nelder-Mead for each starting point: m_calib_res_llk <- m_calib_res_sse <- matrix(nrow = n_init, ncol = n_params + 1) colnames(m_calib_res_llk) <- colnames(m_calib_res_sse) <- c(v_params_names, "Overall_fit") ## Use Nelder-Mead as method in the optim() function: for (j in 1:n_init) { fit_nm <- optim(par = v_params_init[j, ], fn = f_gof_llk, # GOF is log likelihood method = "Nelder-Mead", control = list(fnscale = -1, # switch to maximisation maxit = 1000), # maximum iterations hessian = TRUE) m_calib_res_llk[j, ] <- c(fit_nm$par, fit_nm$value) res_llk[j] <- list(summ_optim(.best = fit_nm$value, .iter = j, .par = fit_nm$par, .hessian = fit_nm$hessian)) } res_all <- append(res_all, list('NM_llk' = res_llk %>% map_dfr(.f = ~ .x) %>% pivot_wider(id_cols = c(Iteration, Best), names_from = Params, values_from = c(Estimate, Lower, Upper)))) for (j in 1:n_init) { fit_nm <- optim(par = v_params_init[j, ], fn = f_gof_sse, # GOF is sum of squared errors method = "Nelder-Mead", control = list(fnscale = -1, # switch to maximisation maxit = 1000), # maximum iterations hessian = TRUE) m_calib_res_sse[j, ] <- c(fit_nm$par, fit_nm$value) res_sse[j] <- list(summ_optim(.best = fit_nm$value, .iter = j, .par = fit_nm$par, .hessian = fit_nm$hessian)) } res_all <- append(res_all, list('NM_sse' = res_sse %>% map_dfr(.f = ~ .x) %>% pivot_wider(id_cols = c(Iteration, Best), names_from = Params, values_from = c(Estimate, Lower, Upper)))) # Calculate computation time: comp_time <- Sys.time() - t_init comp_time
## Sort in descending order for LLK and ascending for SSE: m_calib_res_llk <- m_calib_res_llk %>% as_tibble() %>% arrange(desc(Overall_fit)) m_calib_res_sse <- m_calib_res_sse %>% as_tibble() %>% arrange(desc(Overall_fit)) ## Examine the top 10 best-fitting sets: m_calib_res_llk[1:10,] m_calib_res_sse[1:10,] # Plot the top 100 (top 10%): ggplot() + geom_point(data = m_calib_res_sse[1:100,], aes(x = p_Mets, y = p_DieMets)) + theme( panel.border = element_rect(fill = NA, colour = 'black') ) + labs(title = "SSE parameters") ggplot() + geom_point(data = m_calib_res_llk[1:100,], aes(x = p_Mets, y = p_DieMets)) + theme( panel.border = element_rect(fill = NA, colour = 'black') ) + labs(title = "Likelihood parameters") # Pairwise comparison of top 100 sets: pairs.panels(m_calib_res_llk[1:100, v_params_names]) pairs.panels(m_calib_res_sse[1:100, v_params_names]) # Plot model-predicted output at best set vs targets: v_out_best_llk <- CRS_markov(m_calib_res_llk[1,]) v_out_best_sse <- CRS_markov(m_calib_res_sse[1,]) targets_pt + geom_point(data = tibble('Likelihood' = v_out_best_llk$Surv) %>% mutate('SSE' = v_out_best_sse$Surv, 'time' = cbind(lst_targets$Surv$time)) %>% pivot_longer(cols = -time, names_to = 'GOF', values_to = 'Survival'), aes(x = time, y = Survival, color = GOF, shape = GOF), alpha = 0.5)
res_llk = list() res_sse = list() set.seed(seed_no) ## Record start time of calibration: t_init <- Sys.time() ## Run Simulated Annealing for each starting point: m_calib_res_llk <- m_calib_res_sse <- matrix(nrow = n_init, ncol = n_params + 1) colnames(m_calib_res_llk) <- colnames(m_calib_res_sse) <- c(v_params_names, "Overall_fit") ## Use Simulated Annealing as method in the optim() function: for (j in 1:n_init) { fit_sa <- optim(par = v_params_init[j, ], fn = f_gof_llk, # GOF is log likelihood method = "SANN", control = list(fnscale = -1, # switch to maximisation temp = 10, # algorithm tuning parameters tmax = 10, # algorithm tuning parameters maxit = 1000), # maximum iterations hessian = TRUE) m_calib_res_llk[j, ] <- c(fit_sa$par, fit_sa$value) res_llk[j] <- list(summ_optim(.best = fit_sa$value, .iter = j, .par = fit_sa$par, .hessian = fit_sa$hessian)) } res_all <- append(res_all, list('SA_llk' = res_llk %>% map_dfr(.f = ~ .x) %>% pivot_wider(id_cols = c(Iteration, Best), names_from = Params, values_from = c(Estimate, Lower, Upper)))) for (j in 1:n_init) { fit_sa <- optim(par = v_params_init[j, ], fn = f_gof_sse, # GOF is sum of squared errors method = "SANN", control = list(fnscale = -1, # switch to maximisation temp = 10, # algorithm tuning parameters tmax = 10, # algorithm tuning parameters maxit = 1000), # maximum iterations hessian = TRUE) m_calib_res_sse[j, ] <- c(fit_sa$par, fit_sa$value) res_sse[j] <- list(summ_optim(.best = fit_sa$value, .iter = j, .par = fit_sa$par, .hessian = fit_sa$hessian)) } res_all <- append(res_all, list('SA_sse' = res_sse %>% map_dfr(.f = ~ .x) %>% pivot_wider(id_cols = c(Iteration, Best), names_from = Params, values_from = c(Estimate, Lower, Upper)))) # Calculate computation time: comp_time <- Sys.time() - t_init comp_time
## Sort in descending order for LLK and ascending for SSE: m_calib_res_llk <- m_calib_res_llk %>% as_tibble() %>% arrange(desc(Overall_fit)) m_calib_res_sse <- m_calib_res_sse %>% as_tibble() %>% arrange(desc(Overall_fit)) ## Examine the top 10 best-fitting sets: m_calib_res_llk[1:10,] m_calib_res_sse[1:10,] # Plot the top 100 (top 10%): ggplot() + geom_point(data = m_calib_res_sse[1:100,], aes(x = p_Mets, y = p_DieMets)) + theme( panel.border = element_rect(fill = NA, colour = 'black') ) + labs(title = "SSE parameters") ggplot() + geom_point(data = m_calib_res_llk[1:100,], aes(x = p_Mets, y = p_DieMets)) + theme( panel.border = element_rect(fill = NA, colour = 'black') ) + labs(title = "Likelihood parameters") # Pairwise comparison of top 100 sets: pairs.panels(m_calib_res_llk[1:100, v_params_names]) pairs.panels(m_calib_res_sse[1:100, v_params_names]) # Plot model-predicted output at best set vs targets: v_out_best_llk <- CRS_markov(m_calib_res_llk[1,]) v_out_best_sse <- CRS_markov(m_calib_res_sse[1,]) targets_pt + geom_point(data = tibble('Likelihood' = v_out_best_llk$Surv) %>% mutate('SSE' = v_out_best_sse$Surv, 'time' = cbind(lst_targets$Surv$time)) %>% pivot_longer(cols = -time, names_to = 'GOF', values_to = 'Survival'), aes(x = time, y = Survival, color = GOF, shape = GOF), alpha = 0.5)
res_llk = list() res_sse = list() set.seed(seed_no) ## Record start time of calibration: t_init <- Sys.time() ## Run Genetic Algorithm for each starting point: m_calib_res_llk <- m_calib_res_sse <- matrix(nrow = 1, ncol = n_params + 1) colnames(m_calib_res_llk) <- colnames(m_calib_res_sse) <- c(v_params_names, "Overall_fit") ## Define fitness functions to return minimis-able values in DEoptim: fitness_llk = function(.params) { # Log Likelihood (LLK) names(.params) <- v_params_names return(-1 * f_gof_llk(.params)) } fitness_sse = function(.params) { # Sum of Squared Errors (SSE) names(.params) <- v_params_names return(-1 * f_gof_sse(.params)) } ## Use Genetic Algorithm through the DEoptim package: fit_ga <- DEoptim(fitness_llk, # LLK lower = lb, upper = ub, control = DEoptim.control(trace = FALSE)) m_calib_res_llk[1, ] <- c(fit_ga$optim$bestmem, -fit_ga$optim$bestval) res_llk <- list(summ_optim(.best = -fit_ga$optim$bestval, .par = fit_ga$optim$bestmem, .hessian = NULL, .func = fitness_llk)) res_all <- append(res_all, list('GA_llk' = res_llk %>% map_dfr(.f = ~ .x) %>% pivot_wider(id_cols = c(Iteration, Best), names_from = Params, values_from = c(Estimate, Lower, Upper)))) fit_ga <- DEoptim(fitness_sse, # SSE lower = lb, upper = ub, control = DEoptim.control(trace = FALSE)) m_calib_res_sse[1, ] <- c(fit_ga$optim$bestmem, -fit_ga$optim$bestval) res_sse <- list(summ_optim(.best = -fit_ga$optim$bestval, .par = fit_ga$optim$bestmem, .hessian = NULL, .func = fitness_llk)) res_all <- append(res_all, list('GA_sse' = res_sse %>% map_dfr(.f = ~ .x) %>% pivot_wider(id_cols = c(Iteration, Best), names_from = Params, values_from = c(Estimate, Lower, Upper)))) # Calculate computation time: comp_time <- Sys.time() - t_init comp_time
## Give the outputs appropriate names: m_calib_res_llk <- m_calib_res_llk %>% as_tibble() m_calib_res_sse <- m_calib_res_sse %>% as_tibble() ## Examine the best-fitting set: m_calib_res_llk m_calib_res_sse # Plot the top 100 (top 10%): ggplot() + geom_point(data = m_calib_res_sse, aes(x = p_Mets, y = p_DieMets)) + theme( panel.border = element_rect(fill = NA, colour = 'black') ) + labs(title = "SSE parameters") ggplot() + geom_point(data = m_calib_res_llk, aes(x = p_Mets, y = p_DieMets)) + theme( panel.border = element_rect(fill = NA, colour = 'black') ) + labs(title = "Likelihood parameters") # Pairwise comparison of top 100 sets: pairs.panels(m_calib_res_llk[, 1:2]) pairs.panels(m_calib_res_sse[, 1:2]) # Plot model-predicted output at best set vs targets: v_out_best_llk <- CRS_markov(m_calib_res_llk[1,]) v_out_best_sse <- CRS_markov(m_calib_res_sse[1,]) targets_pt + geom_point(data = tibble('Likelihood' = v_out_best_llk$Surv) %>% mutate('SSE' = v_out_best_sse$Surv, 'time' = cbind(lst_targets$Surv$time)) %>% pivot_longer(cols = -time, names_to = 'GOF', values_to = 'Survival'), aes(x = time, y = Survival, color = GOF, shape = GOF), alpha = 0.5)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.