knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
#library(calibR)

This code was created by the DARTH workgroup (www.darthworkgroup.com). When using or modifying this code, please do so with attribution and cite our publications:

Alarid-Escudero F, Maclehose RF, Peralta Y, Kuntz KM, Enns EA. Non-identifiability in model calibration and implications for medical decision making. Med Decis Making. 2018; 38(7):810-821.

Jalal H, Pechlivanoglou P, Krijkamp E, Alarid-Escudero F, Enns E, Hunink MG. An Overview of R in Health Decision Sciences. Med Decis Making. 2017; 37(3): 735-746.

A walkthrough of the code can be found at the following link: https://darth-git.github.io/calibSMDM2018-materials/

Calibration Specifications

- Model: 3-State Cancer Relative Survival (CRS) Markov Model
- Inputs to be calibrated: p_Mets, p_DieMets
- Targets: Surv

Search method:

- Directed search using:
  - Gradient-based (GB) (uses derivatives)
  - Nelder-Mead (NM) algorithm, aka simplex method (derivative free)
- Global optimization techniques:
  - Simulated Annealing (SANN)
  - Genetic algorithms (GA)

Goodness-of-fit measures: log likelihood (LLK) and sum of squared errors (SSE)
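
The two measures implemented below can be written as follows, where $y_t$, $\hat{y}_t(\theta)$, and $se_t$ denote the target value, the model prediction, and the target's standard error at time $t$ (both measures are maximised during calibration):

$$
\mathrm{GOF}_{\mathrm{LLK}}(\theta) = \sum_t \log \phi\!\left(y_t;\ \hat{y}_t(\theta),\ se_t\right),
\qquad
\mathrm{GOF}_{\mathrm{SSE}}(\theta) = -\sum_t \frac{\left(y_t - \hat{y}_t(\theta)\right)^2}{se_t^2},
$$

where $\phi(\cdot;\ \mu, \sigma)$ is the normal density with mean $\mu$ and standard deviation $\sigma$.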

Load packages and functions:

# General purposes:
pacman::p_load(devtools, testthat, usethis, tidyverse)
devtools::load_all()
# Calibration:
pacman::p_load(lhs) # Sampling
pacman::p_load(DEoptim) # Genetic Algorithm
pacman::p_load(numDeriv) # Hessian approximation
# Visualisation:
pacman::p_load(plotrix, psych)

Load target data:

load(file.path(here::here(), "data", "CRS_targets.rda"))
lst_targets <- CRS_targets
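
The plotting and goodness-of-fit code below assumes that lst_targets$Surv contains (at least) the columns time, value, se, lb, and ub; a quick structural check:

# Inspect the survival target (expected columns: time, value, se, lb, ub):
str(lst_targets$Surv)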

Visualise target data:

# Survival ("Surv")
targets_pt = ggplot(data = lst_targets$Surv,
                    aes(x = time,
                        y = value)) +
  geom_errorbar(aes(ymin = lb, ymax = ub)) +
  geom_point() +
  theme(
    panel.border = element_rect(fill = NA, color = 'black')
  ) + 
  labs(title = "Calibration target",
       x = "Time",
       y = "Proportion survived")
targets_pt

The model is defined as a function; test that it works:

# Check that the model works:
v_params_test <- c(p_Mets = 0.10, p_DieMets = 0.05) # true values
# Check that inputs are handled correctly, by using true values:
compare(CRS_markov(v_params = v_params_test), CRS_markov())
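
## Optional check (not in the original): assuming compare() resolves to
## testthat::compare(), its $equal element should be TRUE when the model
## defaults match the test values:
stopifnot(compare(CRS_markov(v_params = v_params_test), CRS_markov())$equal)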

# Compare outputs of default values to the targets:
targets_pt +
  geom_line(data = as_tibble(CRS_markov()) %>%
              mutate(time = cbind(lst_targets$Surv$time)), 
            aes(x = time, y = Surv),
            color = 'red') +
  scale_color_manual(values = c('Surv' = "red"))

Specify calibration parameters:

# Specify seed (for reproducible sequence of random numbers):
seed_no = 1
set.seed(seed_no)

# Number of initial starting points - Nelder-mead:
n_init <- 100

# Number of random samples:
n_samples <- 1000

# Names and number of input parameters to be calibrated:
v_params_names <- c("p_Mets", "p_DieMets")
n_params <- length(v_params_names)

# Range on input search space:
lb <- c(p_Mets = 0.04, p_DieMets = 0.04) # lower bound
ub <- c(p_Mets = 0.16, p_DieMets = 0.16) # upper bound

# Number of calibration targets:
v_target_names <- c("Surv")
n_target <- length(v_target_names)
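
A small optional sanity check on the search-space specification (not part of the original code): the bounds should share names with v_params_names, and each lower bound should sit below its upper bound.

# Optional sanity check on the search space:
stopifnot(identical(names(lb), v_params_names),
          identical(names(ub), v_params_names),
          all(lb < ub))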

Calibrate:

Write goodness-of-fit functions to pass to the optimisation algorithms, plus a helper that summarises uncertainty from the hessian:

## Goodness-of-fit functions to pass to the algorithms:
## Log likelihood (LLK) goodness-of-fit function:
f_gof_llk <- function(.v_params, .lst_targets = lst_targets) {

  ### Run model for a given parameter set:
  model_res <- CRS_markov(v_params = .v_params)

  ### Calculate goodness-of-fit of model outputs to targets:
  v_GOF <- numeric(n_target)
  ### TARGET 1: Survival ("Surv")
  ### log likelihood:
  v_GOF[1] <- sum(dnorm(x = .lst_targets$Surv$value,
                        mean = model_res$Surv,
                        sd = .lst_targets$Surv$se,
                        log = TRUE))

  ### Overall fit:
  ### Different targets can have different weights:
  v_weights <- rep(1, n_target)
  ### Weighted sum:
  GOF_overall <- sum(v_GOF[1:n_target] * v_weights)

  ### return GOF
  return(GOF_overall)
}

## Sum of squared errors (SSE) goodness-of-fit function:
f_gof_sse <- function(.v_params, .lst_targets = lst_targets) {

  ### Run model for a given parameter set:
  model_res <- CRS_markov(v_params = .v_params)

  ### Calculate goodness-of-fit of model outputs to targets:
  v_GOF <- numeric(n_target)
  ### TARGET 1: Survival ("Surv")
  ### Weighted sum of squared errors:
  w <- 1/(.lst_targets$Surv$se^2)
  v_GOF[1] <- -sum(w * (.lst_targets$Surv$value - model_res$Surv)^2)

  ### Overall fit:
  ### Different targets can have different weights:
  v_weights <- rep(1, n_target)
  ### Weighted sum:
  GOF_overall <- sum(v_GOF[1:n_target] * v_weights)

  ### return GOF
  return(GOF_overall)
}
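
## Optional sanity check (not in the original): evaluate both GOF functions at
## the test parameter values; each should return a single finite number,
## with larger values indicating a better fit:
f_gof_llk(.v_params = v_params_test)
f_gof_sse(.v_params = v_params_test)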

## Uncertainty estimates from the hessian matrix:
summ_optim <- function(.name = v_params_names, .best, .par, .func = NULL,
                       .hessian = NULL, .iter = 1) {
  stopifnot((!is.null(.hessian) | !is.null(.func)))
  # Approximate the hessian matrix if not estimated by the package:
  if(is.null(.hessian)) {
    .hessian <- hessian(func = .func, x = .par)
  }

  # Invert the hessian to approximate the covariance matrix:
  fisher_info <- solve(.hessian)

  # Variances cannot be negative; if the supplied hessian corresponds to a
  # maximised objective (negative definite), flip the sign of the inverse:
  if(any(diag(fisher_info) < 0)) fisher_info = -fisher_info

  # Get the standard errors:
  prop_se <- sqrt(diag(fisher_info))

  # Calculate confidence interval:
  upper <- .par + 1.96 * prop_se
  lower <- .par - 1.96 * prop_se

  return(tibble(Params = .name, Best = .best, Estimate = .par, 
                Lower = lower, Upper = upper, Iteration = .iter))
}
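
The interval construction in summ_optim() is the usual asymptotic normal approximation: with $H$ the hessian of the (log) goodness-of-fit at the optimum $\hat{\theta}$,

$$
\widehat{\Sigma} = (-H)^{-1}, \qquad
se_i = \sqrt{\widehat{\Sigma}_{ii}}, \qquad
\hat{\theta}_i \pm 1.96\, se_i .
$$

The sign check in the code makes the result insensitive to whether the supplied hessian comes from the maximised goodness-of-fit or from its negated, minimised counterpart.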

Calibration settings and initial values for the search algorithms:

## Sample multiple random starting values for some of the algorithms:
v_params_init <- matrix(nrow = n_init, ncol = n_params)
for (i in 1:n_params){
  v_params_init[, i] <- runif(n_init, min=lb[i], max=ub[i])
}
colnames(v_params_init) <- v_params_names

res_all = list()
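
The lhs package loaded earlier is not used here; as an alternative to independent uniform draws, a Latin hypercube sample over the same bounds could be generated as sketched below (an optional variation, not part of the original workflow).

# Optional alternative: Latin hypercube sample of starting values:
m_lhs_unit <- lhs::randomLHS(n = n_init, k = n_params)
v_params_init_lhs <- sapply(1:n_params, function(i) {
  qunif(m_lhs_unit[, i], min = lb[i], max = ub[i])
})
colnames(v_params_init_lhs) <- v_params_names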

Gradient-based (GB) (uses derivatives):

Run the optimisation from each initial parameter set:

res_llk = list()
res_sse = list()
set.seed(seed_no)

## Record start time of calibration:
t_init <- Sys.time()

## Run Gradient-based for each starting point:
m_calib_res_llk <- m_calib_res_sse <- 
  matrix(nrow = n_init, ncol = n_params + 1)
colnames(m_calib_res_llk) <- colnames(m_calib_res_sse) <- 
  c(v_params_names, "Overall_fit")

## Use the gradient-based BFGS method in the optim() function:
for (j in 1:n_init) {
  fit_gb <- optim(par = v_params_init[j, ], 
                  fn = f_gof_llk, # GOF is log likelihood
                  method = "BFGS",
                  control = list(fnscale = -1, # switch to maximisation
                                 maxit = 1000), # maximum iterations
                  hessian = TRUE)
  m_calib_res_llk[j, ] <- c(fit_gb$par, fit_gb$value)
  res_llk[j] <- list(summ_optim(.best = fit_gb$value, .iter = j,
                                .par = fit_gb$par, 
                                .hessian = fit_gb$hessian))
}

res_all <- append(res_all,
                  list('GB_llk' = res_llk %>% 
                         map_dfr(.f = ~ .x) %>% 
                         pivot_wider(id_cols = c(Iteration, Best), 
                                     names_from = Params, 
                                     values_from = c(Estimate, Lower,
                                                     Upper))))

for (j in 1:n_init) {
  fit_gb <- optim(par = v_params_init[j, ], 
                  fn = f_gof_sse, # GOF is sum of squared errors
                  method = "BFGS",
                  control = list(fnscale = -1, # switch to maximisation
                                 maxit = 1000), # maximum iterations
                  hessian = TRUE)
  m_calib_res_sse[j, ] <- c(fit_gb$par, fit_gb$value)
  res_sse[j] <- list(summ_optim(.best = fit_gb$value, .iter = j, 
                                .par = fit_gb$par, 
                                .hessian = fit_gb$hessian))
}

res_all <- append(res_all, 
                  list('GB_sse' = res_sse %>% 
                         map_dfr(.f = ~ .x) %>% 
                         pivot_wider(id_cols = c(Iteration, Best), 
                                     names_from = Params, 
                                     values_from = c(Estimate, Lower,
                                                     Upper))))

# Calculate computation time:
comp_time <- Sys.time() - t_init
comp_time

Exploring best-fitting input sets:

## Sort by overall fit in descending order (both GOF measures are maximised, so higher is better):
m_calib_res_llk <- m_calib_res_llk %>% 
  as_tibble() %>% 
  arrange(desc(Overall_fit))
m_calib_res_sse <- m_calib_res_sse %>% 
  as_tibble() %>% 
  arrange(desc(Overall_fit))

## Examine the top 10 best-fitting sets:
m_calib_res_llk[1:10,]
m_calib_res_sse[1:10,]

# Plot the 100 calibrated parameter sets (one per starting point):
ggplot() +
  geom_point(data = m_calib_res_sse[1:100,],
             aes(x = p_Mets,
                 y = p_DieMets)) +
  theme(
    panel.border = element_rect(fill = NA, colour = 'black')
  ) +
  labs(title = "SSE parameters")

ggplot() +
  geom_point(data = m_calib_res_llk[1:100,],
             aes(x = p_Mets,
                 y = p_DieMets)) +
  theme(
    panel.border = element_rect(fill = NA, colour = 'black')
  ) +
  labs(title = "Likelihood parameters")

# Pairwise comparison of top 100 sets:
pairs.panels(m_calib_res_llk[1:100, v_params_names])
pairs.panels(m_calib_res_sse[1:100, v_params_names])

# Plot model-predicted output at best set vs targets:
v_out_best_llk <- CRS_markov(m_calib_res_llk[1,])
v_out_best_sse <- CRS_markov(m_calib_res_sse[1,])

targets_pt +
  geom_point(data = tibble('Likelihood' = v_out_best_llk$Surv) %>%
               mutate('SSE' = v_out_best_sse$Surv,
                      'time' = cbind(lst_targets$Surv$time)) %>% 
               pivot_longer(cols = -time, names_to = 'GOF', 
                            values_to = 'Survival'),
             aes(x = time, y = Survival, color = GOF, shape = GOF), 
             alpha = 0.5)
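
Since v_params_test was labelled as the true parameter values earlier, the best-fitting sets can also be compared with it directly (an optional check, not part of the original code).

# Compare the best-fitting estimates with the test ("true") values:
rbind(truth    = v_params_test,
      llk_best = unlist(m_calib_res_llk[1, v_params_names]),
      sse_best = unlist(m_calib_res_sse[1, v_params_names]))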

Nelder-Mead (NM) algorithm, aka simplex method (derivative free):

Run the optimisation from each initial parameter set:

res_llk = list()
res_sse = list()
set.seed(seed_no)

## Record start time of calibration:
t_init <- Sys.time()

## Run Nelder-Mead for each starting point:
m_calib_res_llk <- m_calib_res_sse <- 
  matrix(nrow = n_init, ncol = n_params + 1)
colnames(m_calib_res_llk) <- colnames(m_calib_res_sse) <- 
  c(v_params_names, "Overall_fit")

## Use Nelder-Mead as the method in the optim() function:
for (j in 1:n_init) {
  fit_nm <- optim(par = v_params_init[j, ], 
                  fn = f_gof_llk, # GOF is log likelihood
                  method = "Nelder-Mead",
                  control = list(fnscale = -1, # switch to maximisation
                                 maxit = 1000), # maximum iterations
                  hessian = TRUE)
  m_calib_res_llk[j, ] <- c(fit_nm$par, fit_nm$value)
  res_llk[j] <- list(summ_optim(.best = fit_nm$value, .iter = j,
                                .par = fit_nm$par, 
                                .hessian = fit_nm$hessian))
}

res_all <- append(res_all,
                  list('NM_llk' = res_llk %>% 
                         map_dfr(.f = ~ .x) %>% 
                         pivot_wider(id_cols = c(Iteration, Best), 
                                     names_from = Params, 
                                     values_from = c(Estimate, Lower,
                                                     Upper))))

for (j in 1:n_init) {
  fit_nm <- optim(par = v_params_init[j, ], 
                  fn = f_gof_sse, # GOF is sum of squared errors
                  method = "Nelder-Mead",
                  control = list(fnscale = -1, # switch to maximisation
                                 maxit = 1000), # maximum iterations
                  hessian = TRUE)
  m_calib_res_sse[j, ] <- c(fit_nm$par, fit_nm$value)
  res_sse[j] <- list(summ_optim(.best = fit_nm$value, .iter = j,
                                .par = fit_nm$par, 
                                .hessian = fit_nm$hessian))
}

res_all <- append(res_all,
                  list('NM_sse' = res_sse %>% 
                         map_dfr(.f = ~ .x) %>% 
                         pivot_wider(id_cols = c(Iteration, Best), 
                                     names_from = Params, 
                                     values_from = c(Estimate, Lower,
                                                     Upper))))

# Calculate computation time:
comp_time <- Sys.time() - t_init
comp_time
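
Nelder-Mead can stop at the iteration limit rather than at an optimum; optim() reports this through its convergence code (0 means successful convergence, 1 means maxit was reached, 10 indicates simplex degeneracy). A quick look at the last fit from the loops above, for illustration:

# Convergence code and function-evaluation counts of the last Nelder-Mead fit:
fit_nm$convergence
fit_nm$counts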

Exploring best-fitting input sets:

## Sort by overall fit in descending order (both GOF measures are maximised, so higher is better):
m_calib_res_llk <- m_calib_res_llk %>% 
  as_tibble() %>% 
  arrange(desc(Overall_fit))
m_calib_res_sse <- m_calib_res_sse %>% 
  as_tibble() %>% 
  arrange(desc(Overall_fit))

## Examine the top 10 best-fitting sets:
m_calib_res_llk[1:10,]
m_calib_res_sse[1:10,]

# Plot the 100 calibrated parameter sets (one per starting point):
ggplot() +
  geom_point(data = m_calib_res_sse[1:100,],
             aes(x = p_Mets,
                 y = p_DieMets)) +
  theme(
    panel.border = element_rect(fill = NA, colour = 'black')
  ) +
  labs(title = "SSE parameters")

ggplot() +
  geom_point(data = m_calib_res_llk[1:100,],
             aes(x = p_Mets,
                 y = p_DieMets)) +
  theme(
    panel.border = element_rect(fill = NA, colour = 'black')
  ) +
  labs(title = "Likelihood parameters")

# Pairwise comparison of top 100 sets:
pairs.panels(m_calib_res_llk[1:100, v_params_names])
pairs.panels(m_calib_res_sse[1:100, v_params_names])

# Plot model-predicted output at best set vs targets:
v_out_best_llk <- CRS_markov(m_calib_res_llk[1,])
v_out_best_sse <- CRS_markov(m_calib_res_sse[1,])

targets_pt +
  geom_point(data = tibble('Likelihood' = v_out_best_llk$Surv) %>%
               mutate('SSE' = v_out_best_sse$Surv,
                      'time' = cbind(lst_targets$Surv$time)) %>% 
               pivot_longer(cols = -time, names_to = 'GOF', 
                            values_to = 'Survival'),
             aes(x = time, y = Survival, color = GOF, shape = GOF), 
             alpha = 0.5)

Global optimization techniques:

Simulated Annealing (SANN):

Run the optimisation from each initial parameter set:

res_llk = list()
res_sse = list()
set.seed(seed_no)

## Record start time of calibration:
t_init <- Sys.time()

## Run Simulated Annealing for each starting point:
m_calib_res_llk <- m_calib_res_sse <- 
  matrix(nrow = n_init, ncol = n_params + 1)
colnames(m_calib_res_llk) <- colnames(m_calib_res_sse) <- 
  c(v_params_names, "Overall_fit")

## Use Simulated Annealing as the method in the optim() function:
for (j in 1:n_init) {
  fit_sa <- optim(par = v_params_init[j, ], 
                  fn = f_gof_llk, # GOF is log likelihood
                  method = "SANN",
                  control = list(fnscale = -1, # switch to maximisation
                                 temp = 10, # algorithm tuning parameters
                                 tmax = 10, # algorithm tuning parameters
                                 maxit = 1000), # maximum iterations
                  hessian = TRUE)  
  m_calib_res_llk[j, ] <- c(fit_sa$par, fit_sa$value)
  res_llk[j] <- list(summ_optim(.best = fit_sa$value, .iter = j,
                                .par = fit_sa$par, 
                                .hessian = fit_sa$hessian))
}

res_all <- append(res_all,
                  list('SA_llk' = res_llk %>% 
                         map_dfr(.f = ~ .x) %>% 
                         pivot_wider(id_cols = c(Iteration, Best), 
                                     names_from = Params, 
                                     values_from = c(Estimate, Lower,
                                                     Upper))))
for (j in 1:n_init) {
  fit_sa <- optim(par = v_params_init[j, ], 
                  fn = f_gof_sse, # GOF is sum of squared errors
                  method = "SANN",
                  control = list(fnscale = -1, # switch to maximisation
                                 temp = 10, # algorithm tuning parameters
                                 tmax = 10, # algorithm tuning parameters
                                 maxit = 1000), # maximum iterations
                  hessian = TRUE)
  m_calib_res_sse[j, ] <- c(fit_sa$par, fit_sa$value)
  res_sse[j] <- list(summ_optim(.best = fit_sa$value, .iter = j,
                                .par = fit_sa$par, 
                                .hessian = fit_sa$hessian))
}

res_all <- append(res_all,
                  list('SA_sse' = res_sse %>% 
                         map_dfr(.f = ~ .x) %>% 
                         pivot_wider(id_cols = c(Iteration, Best), 
                                     names_from = Params, 
                                     values_from = c(Estimate, Lower,
                                                     Upper))))
# Calculate computation time:
comp_time <- Sys.time() - t_init
comp_time

Exploring best-fitting input sets:

## Sort by overall fit in descending order (both GOF measures are maximised, so higher is better):
m_calib_res_llk <- m_calib_res_llk %>% 
  as_tibble() %>% 
  arrange(desc(Overall_fit))
m_calib_res_sse <- m_calib_res_sse %>% 
  as_tibble() %>% 
  arrange(desc(Overall_fit))

## Examine the top 10 best-fitting sets:
m_calib_res_llk[1:10,]
m_calib_res_sse[1:10,]

# Plot the 100 calibrated parameter sets (one per starting point):
ggplot() +
  geom_point(data = m_calib_res_sse[1:100,],
             aes(x = p_Mets,
                 y = p_DieMets)) +
  theme(
    panel.border = element_rect(fill = NA, colour = 'black')
  ) +
  labs(title = "SSE parameters")

ggplot() +
  geom_point(data = m_calib_res_llk[1:100,],
             aes(x = p_Mets,
                 y = p_DieMets)) +
  theme(
    panel.border = element_rect(fill = NA, colour = 'black')
  ) +
  labs(title = "Likelihood parameters")

# Pairwise comparison of top 100 sets:
pairs.panels(m_calib_res_llk[1:100, v_params_names])
pairs.panels(m_calib_res_sse[1:100, v_params_names])

# Plot model-predicted output at best set vs targets:
v_out_best_llk <- CRS_markov(m_calib_res_llk[1,])
v_out_best_sse <- CRS_markov(m_calib_res_sse[1,])

targets_pt +
  geom_point(data = tibble('Likelihood' = v_out_best_llk$Surv) %>%
               mutate('SSE' = v_out_best_sse$Surv,
                      'time' = cbind(lst_targets$Surv$time)) %>% 
               pivot_longer(cols = -time, names_to = 'GOF', 
                            values_to = 'Survival'),
             aes(x = time, y = Survival, color = GOF, shape = GOF), 
             alpha = 0.5)

Genetic algorithms (GA):

Run the optimisation over the parameter search space:

res_llk = list()
res_sse = list()
set.seed(seed_no)

## Record start time of calibration:
t_init <- Sys.time()

## DEoptim searches within the bounds, so only one result per GOF measure is stored:
m_calib_res_llk <- m_calib_res_sse <- 
  matrix(nrow = 1, ncol = n_params + 1)
colnames(m_calib_res_llk) <- colnames(m_calib_res_sse) <- 
  c(v_params_names, "Overall_fit")

## Define fitness functions that return values for DEoptim to minimise:
fitness_llk = function(.params) { # Log Likelihood (LLK)
  names(.params) <- v_params_names
  return(-1 * f_gof_llk(.params)) 
}

fitness_sse = function(.params) { # Sum of Squared Errors (SSE)
  names(.params) <- v_params_names
  return(-1 * f_gof_sse(.params)) 
}
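
## Optional check (not in the original): DEoptim minimises its objective and
## passes unnamed parameter vectors, hence the name assignment and sign flip.
## The wrapped fitness should equal the negated GOF at the test values above:
all.equal(fitness_llk(c(0.10, 0.05)), -f_gof_llk(v_params_test))
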
## Use Genetic Algorithm through the DEoptim package:
fit_ga <- DEoptim(fitness_llk, # LLK
                  lower = lb,
                  upper = ub,
                  control = DEoptim.control(trace = FALSE))
m_calib_res_llk[1, ] <- c(fit_ga$optim$bestmem, -fit_ga$optim$bestval)
res_llk <- list(summ_optim(.best = -fit_ga$optim$bestval,
                           .par = fit_ga$optim$bestmem,
                           .hessian = NULL, .func = fitness_llk))

res_all <- append(res_all,
                  list('GA_llk' = res_llk %>%
                         map_dfr(.f = ~ .x) %>%
                         pivot_wider(id_cols = c(Iteration, Best),
                                     names_from = Params,
                                     values_from = c(Estimate, Lower,
                                                     Upper))))

fit_ga <- DEoptim(fitness_sse, # SSE
                  lower = lb,
                  upper = ub,
                  control = DEoptim.control(trace = FALSE))
m_calib_res_sse[1, ] <- c(fit_ga$optim$bestmem, -fit_ga$optim$bestval)
res_sse <- list(summ_optim(.best = -fit_ga$optim$bestval,
                           .par = fit_ga$optim$bestmem,
                           .hessian = NULL, .func = fitness_sse))

res_all <- append(res_all,
                  list('GA_sse' = res_sse %>%
                         map_dfr(.f = ~ .x) %>%
                         pivot_wider(id_cols = c(Iteration, Best),
                                     names_from = Params,
                                     values_from = c(Estimate, Lower,
                                                     Upper))))

# Calculate computation time:
comp_time <- Sys.time() - t_init
comp_time
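
DEoptim's population size and iteration count can be adjusted through DEoptim.control() if the search needs to be more thorough; the values below are purely illustrative (an assumption, not taken from the original).

# Illustrative (untuned) DEoptim settings: larger population, more iterations:
ctrl_ga <- DEoptim.control(trace = FALSE, NP = 40, itermax = 500)
# fit_ga_tuned <- DEoptim(fitness_llk, lower = lb, upper = ub, control = ctrl_ga)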

Exploring best-fitting input sets:

## Convert the outputs to tibbles:
m_calib_res_llk <- m_calib_res_llk %>% 
  as_tibble()
m_calib_res_sse <- m_calib_res_sse %>% 
  as_tibble()

## Examine the best-fitting set:
m_calib_res_llk
m_calib_res_sse

# Plot the best-fitting parameter set:
ggplot() +
  geom_point(data = m_calib_res_sse,
             aes(x = p_Mets,
                 y = p_DieMets)) +
  theme(
    panel.border = element_rect(fill = NA, colour = 'black')
  ) +
  labs(title = "SSE parameters")

ggplot() +
  geom_point(data = m_calib_res_llk,
             aes(x = p_Mets,
                 y = p_DieMets)) +
  theme(
    panel.border = element_rect(fill = NA, colour = 'black')
  ) +
  labs(title = "Likelihood parameters")

# Pairwise comparison (DEoptim returns a single best-fitting set, so this plot is degenerate):
pairs.panels(m_calib_res_llk[, 1:2])
pairs.panels(m_calib_res_sse[, 1:2])

# Plot model-predicted output at best set vs targets:
v_out_best_llk <- CRS_markov(m_calib_res_llk[1,])
v_out_best_sse <- CRS_markov(m_calib_res_sse[1,])

targets_pt +
  geom_point(data = tibble('Likelihood' = v_out_best_llk$Surv) %>%
               mutate('SSE' = v_out_best_sse$Surv,
                      'time' = cbind(lst_targets$Surv$time)) %>% 
               pivot_longer(cols = -time, names_to = 'GOF', 
                            values_to = 'Survival'),
             aes(x = time, y = Survival, color = GOF, shape = GOF), 
             alpha = 0.5)

