R/validation.R
In epicR: Evaluation Platform in Chronic Obstructive Pulmonary Disease

Documented in sanity_check sanity_COPD test_case_detection validate_COPD validate_diagnosis validate_exacerbation validate_gpvisits validate_lung_function validate_medication validate_mortality validate_overdiagnosis validate_payoffs validate_population validate_smoking validate_survival validate_symptoms validate_treatment

report_mode <- 1
# If 1, we are generating a report!

petoc <- function() {
  if (report_mode == 0) {
    message("Press [Enter] to continue")
    r <- readline()
    if (r == "q") {
      terminate_session()
      stop("User asked for termination.\n")
    }
  }
}


#' Basic tests of model functionality. Serious issues if the test does not pass.
#' @return tests results
#' @export
sanity_check <- function() {
  init_session()

  message("test 1: zero all costs\n")
  input <- model_input$values
  for (el in get_list_elements(input$cost)) input$cost[[el]] <- input$cost[[el]] * 0
  input$medication$medication_costs <- 0 * input$medication$medication_costs
  res <- run(100, input = input)
  if (get_output()$total_cost != 0)
    message("Test failed!") else message("Test passed!")
  terminate_session()


  message("test 2: zero all utilities\n")
  init_session()
  input <- model_input$values
  for (el in get_list_elements(input$utility)) input$utility[[el]] <- input$utility[[el]] * 0
  input$medication$medication_utility <- input$medication$medication_utility*0
  input$global_parameters$discount_qaly <- input$global_parameters$discount_qaly*0
  res <- run(100, input = input)
  if (get_output()$total_qaly != 0) {
    message("Test failed!")
    message(get_output()$total_qaly)} else message("Test passed!")
  terminate_session()

  message("test 3: set all utilities to 1 and get one QALY without discount\n")
  init_session()
  input <- model_input$values
  input$global_parameters$discount_qaly <- 0
  input$medication$medication_utility <- input$medication$medication_utility*0
  for (el in get_list_elements(input$utility)) input$utility[[el]] <- input$utility[[el]] * 0 + 1
  input$utility$exac_dutil = input$utility$exac_dutil * 0
  res <- run(100, input = input)
  if (get_output()$total_qaly/get_output()$cumul_time != 1)
    message("Test failed!") else message("Test passed!")
  terminate_session()

  message("test 4: zero mortality (both bg and exac)\n")
  init_session()
  input <- model_input$values
  input$exacerbation$logit_p_death_by_sex <- input$exacerbation$logit_p_death_by_sex * 0 - 10000000  # log scale
  input$agent$p_bgd_by_sex <- input$agent$p_bgd_by_sex * 0
  input$manual$explicit_mortality_by_age_sex <- input$manual$explicit_mortality_by_age_sex * 0
  res <- run(100, input = input)
  if (get_output()$n_deaths != 0) {
    message (get_output()$n_deaths)
    stop("Test failed!")
  } else message("Test passed!")
  terminate_session()
  return(0)
}







#' Returns simulated vs. predicted population table and a plot
#' @param incidence_k a number (default=1) by which the incidence rate of population will be multiplied.
#' @param remove_COPD 0 or 1, indicating whether COPD-caused mortality should be removed
#' @param savePlots 0 or 1, exports 300 DPI population growth and pyramid plots comparing simulated vs. predicted population
#' @param jurisdiction character string specifying the jurisdiction for validation ("canada" or "us"). Default is "canada".
#' @return For Canada: returns a table showing predicted (StatsCan) and simulated population values. For US: returns a data frame with population comparisons by age group and year.
#' @export
validate_population <- function(remove_COPD = 0, incidence_k = 1, savePlots = 0, jurisdiction = "canada") {
  message("Validate_population(...) is responsible for producing output that can be used to test if the population module is properly calibrated.\n")

  # Handle US jurisdiction
  if (tolower(jurisdiction) == "us") {
    USSimulation <- readr::read_csv(system.file("USCensus.csv", package = "epicR"))

    # Setup and run simulation
    settings <- get_default_settings()
    settings$record_mode <- 0
    settings$n_base_agents <- 1e+05 # Reduced from 1e6 for testing speed

    time_horizon <- 56
    results <- simulate(settings = settings, jurisdiction = "us", time_horizon = time_horizon, extended_results = TRUE)
    output <- results$extended

    # Create data frame
    epic_popsize_age <- data.frame(year = seq(2015, by = 1, length.out = time_horizon),
                                   output$n_alive_by_ctime_age)

    # Rename columns
    # Logic from validation.R: Columns 2 to N are renamed 1 to N-1
    colnames(epic_popsize_age)[2:ncol(epic_popsize_age)] <- 1:(ncol(epic_popsize_age) - 1)

    # Remove columns 2 to 40 (ages < 40)
    epic_popsize_age <- epic_popsize_age[, -(2:40)]

    # Reshape (Long format)
    epic_popsize_age_long <- tidyr::pivot_longer(epic_popsize_age,
                                                 cols = -1,
                                                 names_to = "age",
                                                 values_to = "EPIC_popsize")
    # Convert age to integer
    epic_popsize_age_long$age <- as.integer(epic_popsize_age_long$age)

    # Join/Merge Data
    colnames(USSimulation)[colnames(USSimulation) == "value"] <- "US_popsize"

    validate_pop_size_scaled <- merge(USSimulation, epic_popsize_age_long,
                                      by = c("year", "age"),
                                      all.x = TRUE)

    # Initialize Scaled Column
    validate_pop_size_scaled$EPIC_output_scaled <- NA
    validate_pop_size_scaled$EPIC_output_scaled[validate_pop_size_scaled$year == 2015] <-
      validate_pop_size_scaled$US_popsize[validate_pop_size_scaled$year == 2015]

    # Calculate Growth Rates
    total_epic_by_year <- aggregate(
      x = validate_pop_size_scaled["EPIC_popsize"],
      by = validate_pop_size_scaled["year"],
      FUN = sum,
      na.rm = TRUE
    )

    colnames(total_epic_by_year)[2] <- "total_EPIC_output"

    # Sort by year
    total_epic_by_year <- total_epic_by_year[order(total_epic_by_year$year), ]

    # Calculate growth rate (current / previous)
    prev_vals <- c(NA, total_epic_by_year$total_EPIC_output[-nrow(total_epic_by_year)])
    total_epic_by_year$growth_rate <- total_epic_by_year$total_EPIC_output / prev_vals

    # Merge growth rates back
    df_with_growth <- merge(validate_pop_size_scaled,
                            total_epic_by_year[, c("year", "growth_rate")],
                            by = "year",
                            all.x = TRUE)

    # Sort for calculations
    df_with_growth <- df_with_growth[order(df_with_growth$age, df_with_growth$year), ]

    # Apply Growth Projection
    df_split <- split(df_with_growth, df_with_growth$age)

    df_split <- lapply(df_split, function(sub_df) {
      # Ensure sorted by year
      sub_df <- sub_df[order(sub_df$year), ]
      rates <- sub_df$growth_rate
      rates[is.na(rates)] <- 1
      # Get baseline (2015) US size
      baseline <- sub_df$US_popsize[sub_df$year == 2015]
      if(length(baseline) == 0) baseline <- 0
      else baseline <- baseline[1]

      # Calculate Projection
      sub_df$EPIC_output_scaled <- baseline * cumprod(rates)

      return(sub_df)
    })

    # Recombine
    df_with_growth <- do.call(rbind, df_split)

    # Create Age Groups
    df_with_growth$age_group <- NA
    df_with_growth$age_group[df_with_growth$age >= 40 & df_with_growth$age <= 59] <- "40-59"
    df_with_growth$age_group[df_with_growth$age >= 60 & df_with_growth$age <= 79] <- "60-79"
    df_with_growth$age_group[df_with_growth$age >= 80] <- "80+"

    # Aggregate Final Data
    df_summed_ranges <- aggregate(
      x = df_with_growth[c("EPIC_output_scaled", "US_popsize")],
      by = df_with_growth[c("year", "age_group")],
      FUN = sum,
      na.rm = TRUE
    )

    colnames(df_summed_ranges)[3:4] <- c("total_EPIC_population", "total_US_population")

    # Calculate RMSE
    rmse_results <- by(df_summed_ranges, df_summed_ranges$age_group, function(sub) {
      sqrt(mean((sub$total_EPIC_population - sub$total_US_population)^2, na.rm = TRUE))
    })

    rmse_per_range <- data.frame(age_group = names(rmse_results),
                                 rmse = as.numeric(rmse_results))

    message(paste(capture.output(rmse_per_range), collapse = "\n"))

    # Plotting Loop
    unique_groups <- unique(df_summed_ranges$age_group)

    for(age_grp in unique_groups) {
      # Plot
      df_subset <- df_summed_ranges[df_summed_ranges$year <= 2050 & df_summed_ranges$age_group == age_grp, ]
      df_plot <- tidyr::gather(df_subset,
                               key = "Population_Type",
                               value = "Population",
                               "total_EPIC_population", "total_US_population")

      p <- ggplot2::ggplot(
        df_plot,
        ggplot2::aes(x = .data$year,
                     y = .data$Population,
                     color = .data$Population_Type)
      ) +
        ggplot2::geom_line(linewidth = 1.2) +
        ggplot2::geom_point(size = 2) +
        ggthemes::theme_tufte(base_size = 14, ticks = FALSE) +
        ggplot2::ggtitle(paste("Comparison of EPIC vs. US Population Over Time for Age Group", age_grp)) +
        ggplot2::scale_y_continuous(name = "Population", labels = scales::comma) +
        ggplot2::scale_x_continuous(name = "Year", breaks = seq(min(df_plot$year), max(df_plot$year), by = 2)) +
        ggplot2::expand_limits(y = 0) +
        ggplot2::theme(
          legend.title = ggplot2::element_blank(),
          legend.position = "bottom",
          plot.background = ggplot2::element_rect(fill = "white", color = NA),
          panel.background = ggplot2::element_rect(fill = "white", color = NA)
        )

      print(p)
    }

    # RETURN FOR TESTING
    return(df_summed_ranges)
  }

  # Check for unsupported jurisdictions
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' and 'us' are supported.")
  }

  # Canada implementation (existing code)
  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- 1e+06
  settings$event_stack_size <- 1
  init_session(settings = settings)
  input <- model_input$values  #We can work with local copy more conveniently and submit it to the Run function

  message("\nBecause you have called me with remove_COPD=", remove_COPD, ", I am", c("NOT", "indeed")[remove_COPD + 1], "going to remove COPD-related mortality from my calculations")
  petoc()

  # CanSim.052.0005<-read.csv(system.file ('extdata', 'CanSim.052.0005.csv', package = 'epicR'), header = TRUE); #package ready
  # reading
  x <- aggregate(CanSim.052.0005[, "value"], by = list(CanSim.052.0005[, "year"]), FUN = sum)
  x[, 2] <- x[, 2]/x[1, 2]
  x <- x[1:input$global_parameters$time_horizon, ]
  plot(x, type = "l", ylim = c(0.5, max(x[, 2] * 1.5)), xlab = "Year", ylab = "Relative population size")
  title(cex.main = 0.5, "Relative populaton size")
  message("The plot I just drew is the expected (well, StatCan's predictions) relative population growth from 2015\n")
  petoc()

  if (remove_COPD) {
    input$exacerbation$logit_p_death_by_sex <- -1000 + input$exacerbation$logit_p_death_by_sex
    input$manual$explicit_mortality_by_age_sex <- 0
  }

  input$agent$l_inc_betas[1] <- input$agent$l_inc_betas[1] + log(incidence_k)

  message("working...\n")
  res <- run(input = input)
  if (res < 0) {
    stop("Something went awry; bye!")
    return()
  }

  n_y1_agents <- sum(get_output_ex()$n_alive_by_ctime_sex[1, ])
  legend("topright", c("Predicted", "Simulated"), lty = c(1, 1), col = c("black", "red"))

  message("And the black one is the observed (simulated) growth\n")
   ######## pretty population growth curve

  CanSim <- tibble::as_tibble(CanSim.052.0005)
  CanSim <- tidyr::spread(CanSim, key = year, value = value)
  CanSim <- CanSim[, 3:51]
  CanSim <- colSums (CanSim)

  df <- data.frame(Year = c(2015:(2015 + model_input$values$global_parameters$time_horizon-1)), Predicted = CanSim[1:model_input$values$global_parameters$time_horizon] * 1000, Simulated = rowSums(get_output_ex()$n_alive_by_ctime_sex)/ settings$n_base_agents * 18179400) #rescaling population. There are about 18.6 million Canadians above 40
  message ("Here's simulated vs. predicted population table:")
  message(paste(capture.output(df), collapse = "\n"))
  dfm <- reshape2::melt(df[,c('Year','Predicted','Simulated')], id.vars = 1)
  plot_population_growth  <- ggplot2::ggplot(dfm, aes(x = Year, y = value)) +  theme_tufte(base_size=14, ticks=FALSE) +
    geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
    labs(title = "Population Growth Curve") + ylab ("Population") +
    labs(caption = "(based on population at age 40 and above)") +
    theme(legend.title=element_blank()) +
    scale_y_continuous(name="Population", labels = scales::comma)

  plot (plot_population_growth)
  if (savePlots) ggsave(file.path(tempdir(), paste0("PopulationGrowth",".tiff")), plot = last_plot(), device = "tiff", dpi = 300)


  pyramid <- matrix(NA, nrow = input$global_parameters$time_horizon, ncol = length(get_output_ex()$n_alive_by_ctime_age[1, ]) -
                      input$global_parameters$age0)

  for (year in 0:model_input$values$global_parameters$time_horizon - 1) pyramid[1 + year, ] <- get_output_ex()$n_alive_by_ctime_age[year +1, -(1:input$global_parameters$age0)]


  message("Also, the ratio of the expected to observed population in years 10 and 20 are ", sum(get_output_ex()$n_alive_by_ctime_sex[10,
                                                                                                                                  ])/x[10, 2], " and ", sum(get_output_ex()$n_alive_by_ctime_sex[20, ])/x[20, 2])
  petoc()

  message("Now evaluating the population pyramid\n")
  for (year in c(2015, 2025, 2034)) {
    message("The observed population pyramid in", year, "is just drawn\n")
    x <- CanSim.052.0005[which(CanSim.052.0005[, "year"] == year & CanSim.052.0005[, "sex"] == "both"), "value"]
    #x <- c(x, rep(0, 111 - length(x) - 40))
    #barplot(x,  names.arg=40:110, xlab = "Age")
    #title(cex.main = 0.5, paste("Predicted Pyramid - ", year))
    dfPredicted <- data.frame (population = x * 1000, age = 40:100)


    # message("Predicted average age of those >40 y/o is", sum((input$global_parameters$age0:(input$global_parameters$age0 + length(x) -
    #                                                                                       1)) * x)/sum(x), "\n")
    # petoc()
    #
    # message("Simulated average age of those >40 y/o is", sum((input$global_parameters$age0:(input$global_parameters$age0 + length(x) -
    #                                                                                       1)) * x)/sum(x), "\n")
    # petoc()

    dfSimulated <- data.frame (population = pyramid[year - 2015 + 1, ], age = 40:110)
    dfSimulated$population <- dfSimulated$population * (-1) / settings$n_base_agents * 18179400 #rescaling population. There are 18179400 Canadians above 40

    p <- ggplot (NULL, aes(x = age, y = population)) + theme_tufte(base_size=14, ticks=FALSE) +
         geom_bar (aes(fill = "Simulated"), data = dfSimulated, stat="identity", alpha = 0.5) +
         geom_bar (aes(fill = "Predicted"), data = dfPredicted, stat="identity", alpha = 0.5) +
         theme(axis.title=element_blank()) +
         ggtitle(paste0("Simulated vs. Predicted Population Pyramid in ", year)) +
         theme(legend.title=element_blank()) +
         scale_y_continuous(name="Population", labels = scales::comma) +
         scale_x_continuous(name="Age", labels = scales::comma)
    if (savePlots) ggsave(file.path(tempdir(), paste0("Population ", year,".tiff")), plot = last_plot(), device = "tiff", dpi = 300)

    plot(p)

  }
  terminate_session()
}





#' Returns results of validation tests for smoking module.
#'
#' It compares simulated smoking prevalence and trends against observed data to
#' assess the model's accuracy.
#' @param intercept_k a number
#' @param remove_COPD 0 or 1. whether to remove COPD-related mortality.
#' @param jurisdiction character string specifying the jurisdiction for validation ("canada" or "us"). Default is "canada".
#' @return For Canada: validation test results (invisible). For US: data frame with smoking status rates by year.
#' @export
validate_smoking <- function(remove_COPD = 1, intercept_k = NULL, jurisdiction = "canada") {

  # Handle US jurisdiction
  if (tolower(jurisdiction) == "us") {
    message("Welcome to EPIC validator! Today we will see if the model make good smoking predictions for the US population")

    settings <- get_default_settings()
    settings$record_mode <- record_mode["record_mode_event"]
    settings$n_base_agents <- 1e+04
    settings$event_stack_size <- settings$n_base_agents * 1.7 * 30

    # Prepare custom input modifications
    input_modifications <- list()

    message("\nBecause you have called me with remove_COPD=", remove_COPD, ", I am", c("NOT", "indeed")[remove_COPD + 1], "going to remove COPD-related mortality from my calculations")
    if (remove_COPD) {
      input_modifications$exacerbation_logit_p_death_by_sex_modifier <- -10000
    }

    if (!is.null(intercept_k)) {
      input_modifications$manual_smoking_intercept_k <- intercept_k
    }

    message("There are three US validation targets: 1) the prevalence of current, former, and non-smokers in 2018, 2) the prevalence of current smokers in 2023, and 3) the average annual percent change (AAPC).\n")
    message("Starting validation target 1: baseline prevalence of smokers.\n")

    USSmoking2018 <- readr::read_csv(system.file("USSmoking2018.csv", package = "epicR"))
    tab1 <- as.numeric(USSmoking2018$value[1:3]) / 100
    message("This is the observed percentage of current smokers in 2018 (m,f)\n")
    barplot(tab1, names.arg = c("40-64", "65-74", "75+"), ylim = c(0, 0.4), xlab = "Age group", ylab = "Prevalence of smoking",
            col = c("grey"))
    title(cex.main = 0.5, "Prevalence of current smoker by sex and age group (observed)")
    legend("topright", c("Overall"), fill = c("grey"))

    message("Now I will run the model using the default smoking parameters")
    message("running the model\n")

    # Run with jurisdiction and settings
    # Note: Input modifications for remove_COPD and intercept_k would need to be handled differently
    # For now, we'll modify after getting input from simulate
    input <- get_input(jurisdiction = "us")
    if (remove_COPD) {
      input$values$exacerbation$logit_p_death_by_sex <- input$values$exacerbation$logit_p_death_by_sex * -10000
    }
    if (!is.null(intercept_k)) {
      input$values$manual$smoking$intercept_k <- intercept_k
    }

    # Run with modified input - pass jurisdiction to ensure consistency
    results <- simulate(input = input$values, settings = settings, jurisdiction = "us", extended_results = TRUE, return_events = TRUE)

    dataS <- as.matrix(results$events)
    dataS <- dataS[which(dataS[, "event"] == events["event_start"]), ]
    age_list <- list(a1 = c(40, 64), a2 = c(65, 74), a3 = c(75, 111))
    tab2 <- numeric(length(age_list))

    for (j in 1:length(age_list)) {
      tab2[j] <- mean(dataS[
        dataS[, "age_at_creation"] > age_list[[j]][1] &
          dataS[, "age_at_creation"] <= age_list[[j]][2],
        "smoking_status"
      ])
    }

    message("This is the model generated bar plot")
    barplot(tab2, names.arg = c("40-64", "65-74", "75+"), ylim = c(0, 0.4), xlab = "Age group", ylab = "Prevalence of smoking",
            col = c("black"))
    title(cex.main = 0.5, "Prevalence of current smoking at creation (simulated)")
    legend("topright", c("Overall"), fill = c("black"))

    message("Now we will validate the model on smoking trends")
    message("To model the decline in smoking among adults aged 40 and over beyond 2023, historical trends were analyzed using the 2025 MMWR report (DOI: 10.15585/mmwr.mm7407a3). An Annual Average Percent Change (AAPC) of -1.9% was derived from the observed 2017-2023 decrease in smokers aged 45 and over and applied to future projections\n")

    op_ex <- results$extended
    smoker_prev <- op_ex$n_current_smoker_by_ctime_sex/op_ex$n_alive_by_ctime_sex
    smoker_packyears <- op_ex$sum_pack_years_by_ctime_sex/op_ex$n_alive_by_ctime_sex

    plot(2015:(2015+input$values$global_parameters$time_horizon-1), smoker_prev[, 1], type = "l", ylim = c(0, 0.25), col = "black", xlab = "Year", ylab = "Prevalence of current smoking")
    lines(2015:(2015+input$values$global_parameters$time_horizon-1), smoker_prev[, 2], type = "l", col = "grey")
    legend("topright", c("male", "female"), lty = c(1, 1), col = c("black", "grey"))
    title(cex.main = 0.5, "Annual prevalence of current smoking (simulated)")

    plot(2015:(2015+input$values$global_parameters$time_horizon-1), smoker_packyears[, 1], type = "l", ylim = c(0, 30), col = "black", xlab = "Year", ylab = "Average Pack years")
    lines(2015:(2015+input$values$global_parameters$time_horizon-1), smoker_packyears[, 2], type = "l", col = "grey")
    legend("topright", c("male", "female"), lty = c(1, 1), col = c("black", "grey"))
    title(cex.main = 0.5, "Average Pack-Years Per Year for 40+ Population (simulated)")

    z <- log(rowSums(smoker_prev))
    message("average decline in % of current_smoking rate is", 1 - exp(mean(c(z[-1], NaN) - z, na.rm = TRUE)))

    #plotting overall distribution of smoking stats over time
    smoking_status_ctime <- matrix (NA, nrow = input$values$global_parameters$time_horizon, ncol = 4)
    colnames(smoking_status_ctime) <- c("Year", "Non-Smoker", "Smoker", "Former smoker")

    smoking_status_ctime[1:(input$values$global_parameters$time_horizon), 1] <- c(2015:(2015 + input$values$global_parameters$time_horizon-1))

    smoking_status_ctime [, 2:4] <- op_ex$n_smoking_status_by_ctime / rowSums(as.data.frame (op_ex$n_alive_by_ctime_sex)) * 100
    df <- as.data.frame(smoking_status_ctime)
    dfm <- reshape2::melt(df[,c("Year", "Non-Smoker", "Smoker", "Former smoker")], id.vars = 1)
    plot_smoking_status_ctime  <- ggplot2::ggplot(dfm, aes(x = Year, y = value, color = variable)) +
      geom_point () + geom_line() + labs(title = "Smoking Status per year") + ylab ("%") +
      scale_colour_manual(values = c("#66CC99", "#CC6666", "#56B4E9")) + scale_y_continuous(breaks = scales::pretty_breaks(n = 12))

    plot(plot_smoking_status_ctime)

    terminate_session()

    # Preparing dataframe of rates for the test expectations
    n_alive <- rowSums(op_ex$n_alive_by_ctime_sex)
    results_df <- data.frame(
      Year = smoking_status_ctime[, "Year"],
      Current = op_ex$n_smoking_status_by_ctime[, 2] / n_alive,
      Former = op_ex$n_smoking_status_by_ctime[, 3] / n_alive,
      NonSmoker = op_ex$n_smoking_status_by_ctime[, 1] / n_alive
    )
    return(results_df)
  }

  # Check for unsupported jurisdictions
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' and 'us' are supported.")
  }

  # Canada implementation (existing code)
  message("Welcome to EPIC validator! Today we will see if the model make good smoking predictions")

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_event"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- 1e+05
  settings$event_stack_size <- settings$n_base_agents * 1.7 * 30

  init_session(settings = settings)
  input <- model_input$values

  message("\nBecause you have called me with remove_COPD=", remove_COPD, ", I am", c("NOT", "indeed")[remove_COPD + 1], "going to remove COPD-related mortality from my calculations")
  if (remove_COPD) {
    input$exacerbation$logit_p_death_by_sex <- input$exacerbation$logit_p_death_by_sex * -10000 # TODO why was this zero? Amin
  }

  if (!is.null(intercept_k))
    input$manual$smoking$intercept_k <- intercept_k

  petoc()

  message("There are two validation targets: 1) the prevalence of current smokers (by sex) in 2015, and 2) the projected decline in smoking rate.\n")
  message("Starting validation target 1: baseline prevalence of smokers.\n")
  petoc()

  # CanSim.105.0501<-read.csv(paste(data_path,'/CanSim.105.0501.csv',sep=''),header=TRUE) Included in the package as internal data
  tab1 <- rbind(CanSim.105.0501[1:3, "value"], CanSim.105.0501[4:6, "value"])/100
  message("This is the observed percentage of current smokers in 2014 (m,f)\n")
  barplot(tab1, beside = TRUE, names.arg = c("40", "52", "65+"), ylim = c(0, 0.4), xlab = "Age group", ylab = "Prevalenc of smoking",
          col = c("black", "grey"))
  title(cex.main = 0.5, "Prevalence of current smoker by sex and age group (observed)")
  legend("topright", c("Male", "Female"), fill = c("black", "grey"))
  petoc()

  message("Now I will run the model using the default smoking parameters")
  petoc()
  message("running the model\n")

  run(input = input)
  dataS <- get_all_events_matrix()
  dataS <- dataS[which(dataS[, "event"] == events["event_start"]), ]
  age_list <- list(a1 = c(35, 45), a2 = c(45, 65), a3 = c(65, 111))
  tab2 <- tab1
  for (i in 0:1) for (j in 1:length(age_list)) tab2[i + 1, j] <- mean(dataS[which(dataS[, "female"] == i & dataS[, "age_at_creation"] >
                                                                                    age_list[[j]][1] & dataS[, "age_at_creation"] <= age_list[[j]][2]), "smoking_status"])

  message("This is the model generated bar plot")
  petoc()
  barplot(tab2, beside = TRUE, names.arg = c("40", "52", "65+"), ylim = c(0, 0.4), xlab = "Age group", ylab = "Prevalence of smoking",
          col = c("black", "grey"))
  title(cex.main = 0.5, "Prevalence of current smoking at creation (simulated)")
  legend("topright", c("Male", "Female"), fill = c("black", "grey"))

  message("This step is over; press enter to continue to step 2")
  petoc()

  message("Now we will validate the model on smoking trends")
  petoc()

  message("According to Table 2.1 of this report (see the extracted data in data folder): http://www.tobaccoreport.ca/2015/TobaccoUseinCanada_2015.pdf, the prevalence of current smoker is declining by around 3.8% per year\n")
  petoc()

  op_ex <- get_output_ex()
  smoker_prev <- op_ex$n_current_smoker_by_ctime_sex/op_ex$n_alive_by_ctime_sex
  smoker_packyears <- op_ex$sum_pack_years_by_ctime_sex/op_ex$n_alive_by_ctime_sex

  plot(2015:(2015+input$global_parameters$time_horizon-1), smoker_prev[, 1], type = "l", ylim = c(0, 0.25), col = "black", xlab = "Year", ylab = "Prevalence of current smoking")
  lines(2015:(2015+input$global_parameters$time_horizon-1), smoker_prev[, 2], type = "l", col = "grey")
  legend("topright", c("male", "female"), lty = c(1, 1), col = c("black", "grey"))
  title(cex.main = 0.5, "Annual prevalence of currrent smoking (simulated)")

  plot(2015:(2015+input$global_parameters$time_horizon-1), smoker_packyears[, 1], type = "l", ylim = c(0, 30), col = "black", xlab = "Year", ylab = "Average Pack years")
  lines(2015:(2015+input$global_parameters$time_horizon-1), smoker_packyears[, 2], type = "l", col = "grey")
  legend("topright", c("male", "female"), lty = c(1, 1), col = c("black", "grey"))
  title(cex.main = 0.5, "Average Pack-Years Per Year for 40+ Population (simulated)")


  z <- log(rowSums(smoker_prev))
  message("average decline in % of current_smoking rate is", 1 - exp(mean(c(z[-1], NaN) - z, na.rm = TRUE)))
  petoc()

  #plotting overall distribution of smoking stats over time
  smoking_status_ctime <- matrix (NA, nrow = input$global_parameters$time_horizon, ncol = 4)
  colnames(smoking_status_ctime) <- c("Year", "Non-Smoker", "Smoker", "Former smoker")
  smoking_status_ctime[1:(input$global_parameters$time_horizon), 1] <- c(2015:(2015 + input$global_parameters$time_horizon-1))
  smoking_status_ctime [, 2:4] <- op_ex$n_smoking_status_by_ctime / rowSums(as.data.frame (op_ex$n_alive_by_ctime_sex)) * 100
  df <- as.data.frame(smoking_status_ctime)
  dfm <- reshape2::melt(df[,c("Year", "Non-Smoker", "Smoker", "Former smoker")], id.vars = 1)
  plot_smoking_status_ctime  <- ggplot2::ggplot(dfm, aes(x = Year, y = value, color = variable)) +
    geom_point () + geom_line() + labs(title = "Smoking Status per year") + ylab ("%") +
    scale_colour_manual(values = c("#66CC99", "#CC6666", "#56B4E9")) + scale_y_continuous(breaks = scales::pretty_breaks(n = 12))



  plot(plot_smoking_status_ctime ) #plot needs to be showing

  # Plotting pack-years over time
  dataS <- as.data.frame (get_all_events_matrix())
  dataS <- subset (dataS, (event == 0 | event == 1 ))
  data_all <- dataS
  dataS <- subset (dataS, pack_years != 0)
  avg_pack_years_ctime <- matrix (NA, nrow = input$global_parameters$time_horizon + 1, ncol = 4)
  colnames(avg_pack_years_ctime) <- c("Year", "Smokers PYs", "Former Smokers PYs", "all")

  avg_pack_years_ctime[1:(input$global_parameters$time_horizon + 1), 1] <- c(2015:(2015 + input$global_parameters$time_horizon))

  for (i in 0:input$global_parameters$time_horizon) {
    smokers <- subset (dataS, (floor(local_time + time_at_creation) == (i)) & smoking_status != 0)
    prev_smokers <- subset (dataS, (floor(local_time + time_at_creation) == (i)) & smoking_status == 0)
    all <- subset (data_all, floor(local_time + time_at_creation) == i)
    avg_pack_years_ctime[i+1, "Smokers PYs"] <- colSums(smokers)[["pack_years"]] / dim (smokers)[1]
    avg_pack_years_ctime[i+1, "Former Smokers PYs"] <- colSums(prev_smokers)[["pack_years"]] / dim (prev_smokers) [1]
    avg_pack_years_ctime[i+1, "all"] <- colSums(all)[["pack_years"]] / dim (all) [1] #includes non-smokers

  }

  df <- as.data.frame(avg_pack_years_ctime)
  dfm <- reshape2::melt(df[,c( "Year", "Smokers PYs", "Former Smokers PYs", "all")], id.vars = 1)
  plot_avg_pack_years_ctime <- ggplot2::ggplot(dfm, aes(x = Year, y = value, color = variable)) +
    geom_point () + geom_line() + labs(title = "Average pack-years per year ") + ylab ("Pack-years")

  plot(plot_avg_pack_years_ctime) #plot needs to be showing

  # Plotting pack-years over age

  avg_pack_years_age <- matrix (NA, nrow = 110 - 40 + 1, ncol = 3)
  colnames(avg_pack_years_age) <- c("Age", "Smokers PYs", "Former Smokers PYs")

  avg_pack_years_age[1:(110 - 40 + 1), 1] <- c(40:110)

  for (i in 0:(110 - 40)) {
    smokers <- subset (dataS, (floor (local_time + age_at_creation) == (i+40)) & smoking_status != 0)
    prev_smokers <- subset (dataS, (floor (local_time + age_at_creation) == (i+40)) & smoking_status == 0)
    avg_pack_years_age[i+1, "Smokers PYs"] <- colSums(smokers)[["pack_years"]] / dim (smokers)[1]
    avg_pack_years_age[i+1, "Former Smokers PYs"] <- colSums(prev_smokers)[["pack_years"]] / dim (prev_smokers) [1]
  }

  df <- as.data.frame(avg_pack_years_age)
  dfm <- reshape2::melt(df[,c( "Age", "Smokers PYs", "Former Smokers PYs")], id.vars = 1)
  plot_avg_pack_years_age <- ggplot2::ggplot(dfm, aes(x = Age, y = value, color = variable, ymin = 40, ymax = 100)) +
    geom_point () + geom_line() + labs(title = "Average pack-years per age ") + ylab ("Pack-years")

  plot(plot_avg_pack_years_age) #plot needs to be showing


  message("This test is over; terminating the session")
  petoc()
  terminate_session()
}




#' Basic COPD test.
#' @return validation test results
#' @export
sanity_COPD <- function() {
  settings <- default_settings

  settings$record_mode <- session_env$record_mode["record_mode_agent"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- 20000
  settings$event_stack_size <- settings$n_base_agents * 10

  init_session(settings = settings)

  # Load input parameters
  inp <- get_input()

  message("COPD incidence and prevalence parameters are as follows\n")

  message("model_input$values$COPD$logit_p_COPD_betas_by_sex:\n")
  message(paste(capture.output(inp$values$COPD$logit_p_COPD_betas_by_sex), collapse = "\n"))
  petoc()
  message("model_input$values$COPD$p_prevalent_COPD_stage:\n")
  message(paste(capture.output(inp$values$COPD$p_prevalent_COPD_stage), collapse = "\n"))
  petoc()
  message("model_input$values$COPD$ln_h_COPD_betas_by_sex:\n")
  message(paste(capture.output(inp$values$COPD$ln_h_COPD_betas_by_sex), collapse = "\n"))
  petoc()

  message("Now I am going to first turn off both prevalence and incidence parameters and run the model to see how many COPDs I get\n")
  petoc()
  input <- inp$values
  input$COPD$logit_p_COPD_betas_by_sex <- input$COPD$logit_p_COPD_betas_by_sex * 0 - 100
  input$COPD$ln_h_COPD_betas_by_sex <- input$COPD$ln_h_COPD_betas_by_sex * 0 - 100
  run(input = input)
  message("The model is reporting it has got this many COPDs cases: ", get_output()$n_COPD, " out of ", get_output()$n_agents, " agents.\n")
  all_events <- as.data.frame(get_all_events_matrix())
  dataS <- all_events[all_events$event == events["event_start"], ]
  message("The prevalence of COPD in Start event dump is: ", mean(dataS[, "gold"] > 0), "\n")
  dataS <- all_events[all_events$event == events["event_end"], ]
  message("The prevalence of COPD in End event dump is: ", mean(dataS[, "gold"] > 0), "\n")
  petoc()

  message("Now I am going to switch off incidence and create COPD patients only through prevalence (using natural prevalence)")
  petoc()
  terminate_session()
  init_session(settings = settings)
  inp <- get_input()
  input <- inp$values
  # Keep prevalence at natural values (don't multiply by 0)
  # Turn off incidence completely
  input$COPD$ln_h_COPD_betas_by_sex <- input$COPD$ln_h_COPD_betas_by_sex * 0 - 100
  run(input = input)
  message("The model is reporting it has got this many COPDs: ", get_output()$n_COPD, " out of ", get_output()$n_agents, "agents.\n")
  all_events <- as.data.frame(get_all_events_matrix())
  dataS <- all_events[all_events$event == events["event_start"], ]
  message("The prevalence of COPD in Start event dump is: ", mean(dataS[, "gold"] > 0), "\n")
  dataS <- all_events[all_events$event == events["event_end"], ]
  message("The prevalence of COPD in End event dump is: ", mean(dataS[, "gold"] > 0), "\n")
  petoc()

  message("Now I am going to switch off prevalence and create COPD patients only through incidence\n")
  petoc()
  terminate_session()
  init_session(settings = settings)
  inp <- get_input()
  input <- inp$values
  # Turn off prevalence completely
  input$COPD$logit_p_COPD_betas_by_sex <- input$COPD$logit_p_COPD_betas_by_sex * 0 - 100
  # Keep incidence at natural values (don't set to negative)
  run(input = input)
  message("The model is reporting it has got this many COPDs: ", get_output()$n_COPD, " out of ", get_output()$n_agents, "agents.\n")
  all_events <- as.data.frame(get_all_events_matrix())
  dataS <- all_events[all_events$event == events["event_start"], ]
  message("The prevalence of COPD in Start event dump is: ", mean(dataS[, "gold"] > 0), "\n")
  dataS <- all_events[all_events$event == events["event_end"], ]
  message("The prevalence of COPD in End event dump is: ", mean(dataS[, "gold"] > 0), "\n")
  petoc()


  terminate_session()
}



#' Returns results of validation tests for COPD
#'
#' This function runs validation tests for COPD model outputs. It estimates the baseline
#' prevalence and incidence of COPD, along with sex-specific baseline COPD prevalence.
#' Additionally, it calculates the baseline prevalence of COPD by age groups and
#' smoking pack-years. It also estimates the coefficients for the relationships between
#' age, pack-years, smoking status, and the prevalence of COPD.
#'
#'
#' @param incident_COPD_k a number (default=1) by which the incidence rate of COPD will be multiplied.
#' @param return_CI if TRUE, returns 95 percent confidence intervals for the "Year" coefficient
#' @param jurisdiction character string specifying the jurisdiction for validation ("canada" or "us"). Default is "canada".
#' @return For Canada: list with validation test results. For US: data frame with COPD prevalence by age group over time.
#' @export
validate_COPD <- function(incident_COPD_k = 1, return_CI = FALSE, jurisdiction = "canada") # The incidence rate is multiplied by K
{
  # Handle US jurisdiction
  if (tolower(jurisdiction) == "us") {
    settings <- get_default_settings()
    settings$record_mode <- 0
    settings$n_base_agents <- 1e6

    time_horizon <- 26
    results <- simulate(settings = settings, jurisdiction = "us", time_horizon = time_horizon, extended_results = TRUE)
    output <- results$extended

    # Determine overall COPD prevalence
    COPDprevalence_ctime_age <- as.data.frame(output$n_COPD_by_ctime_age)
    totalpopulation <- output$n_alive_by_ctime_age

    # Overall prevalence of COPD
    alive_age_all <- rowSums(output$n_alive_by_ctime_age[1:time_horizon, 40:111])
    COPD_age_all <- rowSums(output$n_COPD_by_ctime_age[1:time_horizon, 40:111])
    prevalenceCOPD_age_all <- COPD_age_all / alive_age_all

    # Prevalence by age 40-59
    alive_age_40to59 <- rowSums(output$n_alive_by_ctime_age[1:time_horizon, 40:59])
    COPD_age_40to59 <- rowSums(output$n_COPD_by_ctime_age[1:time_horizon, 40:59])
    prevalenceCOPD_age_40to59 <- COPD_age_40to59 / alive_age_40to59

    # Prevalence by age 60-79
    alive_age_60to79 <- rowSums(output$n_alive_by_ctime_age[1:time_horizon, 60:79])
    COPD_age_60to79 <- rowSums(output$n_COPD_by_ctime_age[1:time_horizon, 60:79])
    prevalenceCOPD_age_60to79 <- COPD_age_60to79 / alive_age_60to79

    # Prevalence by age 80+
    alive_age_over80 <- rowSums(output$n_alive_by_ctime_age[1:time_horizon, 80:111])
    COPD_age_over80 <- rowSums(output$n_COPD_by_ctime_age[1:time_horizon, 80:111])
    prevalenceCOPD_age_over80 <- COPD_age_over80 / alive_age_over80

    # Display summary
    COPD_prevalence_summary <- data.frame(
      Year = 2015:(2015 + time_horizon - 1),
      Prevalence_all = prevalenceCOPD_age_all,
      Prevalence_40to59 = prevalenceCOPD_age_40to59,
      Prevalence_60to79 = prevalenceCOPD_age_60to79,
      Prevalence_over80 = prevalenceCOPD_age_over80
    )

    message(paste(capture.output(knitr::kable(COPD_prevalence_summary,
                       caption = "COPD Prevalence by Age Group Over Time",
                       digits = 3)), collapse = "\n"))

    # Plot: All Ages
    plot_prevalenceCOPD_age_all <- data.frame(
      Year = 2015:(2015 + time_horizon - 1),
      Prevalence = prevalenceCOPD_age_all
    )

    gg_plot_prevalenceCOPD_age_all <- ggplot2::ggplot(plot_prevalenceCOPD_age_all,
                                                      ggplot2::aes(x = .data$Year, y = .data$Prevalence)) +
      ggplot2::geom_line(linewidth = 1.5, color = "#003f5c") +
      ggplot2::geom_point(size = 3, color = "#66c2ff", stroke = 0.8) +
      ggplot2::scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 0.15),
        breaks = seq(0, 0.15, by = 0.05)
      ) +
      ggplot2::scale_x_continuous(breaks = seq(2015, 2040, by = 5)) +
      ggplot2::labs(
        title = "COPD Prevalence Over Time (All Ages)",
        subtitle = "Estimated proportion of population with COPD from 2016-2040",
        x = "Year",
        y = "Prevalence (%)"
      ) +
      ggplot2::theme_minimal(base_size = 14) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(face = "bold", size = 18, hjust = 0.5, margin = ggplot2::margin(b = 8)),
        plot.subtitle = ggplot2::element_text(size = 14, hjust = 0.5),
        axis.title = ggplot2::element_text(face = "bold"),
        axis.text = ggplot2::element_text(color = "black"),
        axis.line = ggplot2::element_line(color = "black", linewidth = 0.8),
        panel.grid.major = ggplot2::element_blank(),
        panel.grid.minor = ggplot2::element_blank()
      )

    print(gg_plot_prevalenceCOPD_age_all)

    # Plot: Age 40-59
    plot_prevalence_40to59 <- data.frame(
      Year = 2015:(2015 + time_horizon - 1),
      Prevalence = prevalenceCOPD_age_40to59
    )

    gg_plot_prevalence_40to59 <- ggplot2::ggplot(plot_prevalence_40to59,
                                                 ggplot2::aes(x = .data$Year, y = .data$Prevalence)) +
      ggplot2::geom_line(linewidth = 1.5, color = "#003f5c") +
      ggplot2::geom_point(size = 3, color = "#66c2ff", stroke = 0.8) +
      ggplot2::scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 0.10),
        breaks = seq(0, 0.10, by = 0.05)) +
      ggplot2::scale_x_continuous(breaks = seq(2015, 2040, by = 5)) +
      ggplot2::labs(
        title = "COPD Prevalence Over Time (Age 40-59)",
        subtitle = "Estimated proportion of population with COPD from 2016-2040",
        x = "Year",
        y = "Prevalence (%)") +
      ggplot2::theme_minimal(base_size = 14) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(face = "bold", size = 18, hjust = 0.5, margin = ggplot2::margin(b = 8)),
        plot.subtitle = ggplot2::element_text(size = 14, hjust = 0.5),
        axis.title = ggplot2::element_text(face = "bold"),
        axis.text = ggplot2::element_text(color = "black"),
        axis.line = ggplot2::element_line(color = "black", linewidth = 0.8),
        panel.grid.major = ggplot2::element_blank(),
        panel.grid.minor = ggplot2::element_blank()
      )

    print(gg_plot_prevalence_40to59)

    # Plot: Age 60-79
    plot_prevalence_60to79 <- data.frame(
      Year = 2015:(2015 + time_horizon - 1),
      Prevalence = prevalenceCOPD_age_60to79
    )

    gg_plot_prevalence_60to79 <- ggplot2::ggplot(plot_prevalence_60to79,
                                                 ggplot2::aes(x = .data$Year, y = .data$Prevalence)) +
      ggplot2::geom_line(linewidth = 1.5, color = "#003f5c") +
      ggplot2::geom_point(size = 3, color = "#66c2ff", stroke = 0.8) +
      ggplot2::scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 0.15),
        breaks = seq(0, 0.15, by = 0.05)
      ) +
      ggplot2::scale_x_continuous(breaks = seq(2015, 2040, by = 5)) +
      ggplot2::labs(
        title = "COPD Prevalence Over Time (Age 60-79)",
        subtitle = "Estimated proportion of population with COPD from 2016-2040",
        x = "Year",
        y = "Prevalence (%)"
      ) +
      ggplot2::theme_minimal(base_size = 14) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(face = "bold", size = 18, hjust = 0.5, margin = ggplot2::margin(b = 8)),
        plot.subtitle = ggplot2::element_text(size = 14, hjust = 0.5),
        axis.title = ggplot2::element_text(face = "bold"),
        axis.text = ggplot2::element_text(color = "black"),
        axis.line = ggplot2::element_line(color = "black", linewidth = 0.8),
        panel.grid.major = ggplot2::element_blank(),
        panel.grid.minor = ggplot2::element_blank()
      )

    print(gg_plot_prevalence_60to79)

    # Plot: Age 80+
    plot_prevalence_over80 <- data.frame(
      Year = 2015:(2015 + time_horizon - 1),
      Prevalence = prevalenceCOPD_age_over80
    )

    gg_plot_prevalence_over80 <- ggplot2::ggplot(plot_prevalence_over80,
                                                 ggplot2::aes(x = .data$Year, y = .data$Prevalence)) +
      ggplot2::geom_line(linewidth = 1.5, color = "#003f5c") +
      ggplot2::geom_point(size = 3, color = "#66c2ff", stroke = 0.8) +
      ggplot2::scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 0.30),
        breaks = seq(0, 0.30, by = 0.05)
      ) +
      ggplot2::scale_x_continuous(breaks = seq(2015, 2040, by = 5)) +
      ggplot2::labs(
        title = "COPD Prevalence Over Time (Age 80+)",
        subtitle = "Estimated proportion of population with COPD from 2016-2040",
        x = "Year",
        y = "Prevalence (%)"
      ) +
      ggplot2::theme_minimal(base_size = 14) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(face = "bold", size = 18, hjust = 0.5, margin = ggplot2::margin(b = 8)),
        plot.subtitle = ggplot2::element_text(size = 14, hjust = 0.5),
        axis.title = ggplot2::element_text(face = "bold"),
        axis.text = ggplot2::element_text(color = "black"),
        axis.line = ggplot2::element_line(color = "black", linewidth = 0.8),
        panel.grid.major = ggplot2::element_blank(),
        panel.grid.minor = ggplot2::element_blank()
      )

    print(gg_plot_prevalence_over80)

    terminate_session()
    return(COPD_prevalence_summary)
  }

  # Check for unsupported jurisdictions
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' and 'us' are supported.")
  }

  # Canada implementation (existing code)
  out <- list()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_event"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- 1e+05
  settings$event_stack_size <- settings$n_base_agents * 50
  init_session(settings = settings)
  input <- model_input$values

  if (incident_COPD_k == 0)
    input$COPD$ln_h_COPD_betas_by_sex <- input$COPD$ln_h_COPD_betas_by_sex * 0 - 100 else input$COPD$ln_h_COPD_betas_by_sex[1, ] <- model_input$values$COPD$ln_h_COPD_betas_by_sex[1, ] + log(incident_COPD_k)

  message("working...\n")
  run(input = input)
  op <- get_output()
  opx <- get_output_ex()
  data <- as.data.frame(get_all_events_matrix())
  dataS <- data[which(data[, "event"] == events["event_start"]), ]
  dataE <- data[which(data[, "event"] == events["event_end"]), ]

  out$p_copd_at_creation <- mean(dataS[, "gold"] > 0)

  new_COPDs <- which(dataS[which(dataE[, "gold"] > 0), "gold"] == 0)

  out$inc_copd <- sum(opx$n_inc_COPD_by_ctime_age)/opx$cumul_non_COPD_time
  out$inc_copd_by_sex <- sum(opx$n_inc_COPD_by_ctime_age)/opx$cumul_non_COPD_time

  x <- sqldf::sqldf("SELECT female, SUM(gold>0) AS n_copd, COUNT(*) AS n FROM dataS GROUP BY female")
  out$p_copd_at_creation_by_sex <- x[, "n_copd"]/x[, "n"]



  age_cats <- c(40, 50, 60, 70, 80, 111)
  dataS[, "age_cat"] <- as.numeric(cut(dataS[, "age_at_creation"] + dataS[, "local_time"], age_cats, include.lowest = TRUE))
  x <- sqldf::sqldf("SELECT age_cat, SUM(gold>0) AS n_copd, COUNT(*) AS n FROM dataS GROUP BY age_cat")
  temp <- x[, "n_copd"]/x[, "n"]
  names(temp) <- paste(age_cats[-length(age_cats)], age_cats[-1], sep = "-")
  out$p_copd_at_creation_by_age <- temp


  py_cats <- c(0, 15, 30, 45, Inf)
  dataS[, "py_cat"] <- as.numeric(cut(dataS[, "pack_years"], py_cats, include.lowest = TRUE))
  x <- sqldf::sqldf("SELECT py_cat, SUM(gold>0) AS n_copd, COUNT(*) AS n FROM dataS GROUP BY py_cat")
  temp <- x[, "n_copd"]/x[, "n"]
  names(temp) <- paste(py_cats[-length(py_cats)], py_cats[-1], sep = "-")
  out$p_copd_at_creation_by_pack_years <- temp


  dataF <- data[which(data[, "event"] == events["event_fixed"]), ]
  dataF[, "age"] <- dataF[, "local_time"] + dataF[, "age_at_creation"]
  dataF[, "copd"] <- (dataF[, "gold"] > 0) * 1
  dataF[, "gold2p"] <- (dataF[, "gold"] > 1) * 1
  dataF[, "gold3p"] <- (dataF[, "gold"] > 2) * 1
  dataF[, "year"] <- dataF[, "local_time"] + dataF[, "time_at_creation"]

    res <- glm(data = dataF[which(dataF[, "female"] == 0), ], formula = copd ~ age + pack_years + smoking_status + year, family = binomial(link = logit))
    out$calib_prev_copd_reg_coeffs_male <- coefficients(res)
    if (return_CI) {out$conf_prev_copd_reg_coeffs_male <- stats::confint(res, "year", level = 0.95)}

    res <- glm(data = dataF[which(dataF[, "female"] == 1), ], formula = copd ~ age + pack_years + smoking_status + year, family = binomial(link = logit))
    out$calib_prev_copd_reg_coeffs_female <- coefficients(res)
    if (return_CI) {out$conf_prev_copd_reg_coeffs_female <- stats::confint(res, "year", level = 0.95)}

    res <- glm(data = dataF[which(dataF[, "female"] == 0), ], formula = gold2p ~ age + pack_years + smoking_status + year, family = binomial(link = logit))
    out$calib_prev_gold2p_reg_coeffs_male <- coefficients(res)
    if (return_CI) {out$conf_prev_gold2p_reg_coeffs_male <- stats::confint(res, "year", level = 0.95)}

    res <- glm(data = dataF[which(dataF[, "female"] == 1), ], formula = gold2p ~ age + pack_years + smoking_status + year, family = binomial(link = logit))
    out$calib_prev_gold2p_reg_coeffs_female <- coefficients(res)
    if (return_CI) {out$conf_prev_gold2p_reg_coeffs_female <- stats::confint(res, "year", level = 0.95)}

    res <- glm(data = dataF[which(dataF[, "female"] == 0), ], formula = gold3p ~ age + pack_years + smoking_status + year, family = binomial(link = logit))
    out$calib_prev_gold3p_reg_coeffs_male <- coefficients(res)
    if (return_CI) {out$conf_prev_gold3p_reg_coeffs_male <- stats::confint(res, "year", level = 0.95)}

    res <- glm(data = dataF[which(dataF[, "female"] == 1), ], formula = gold3p ~ age + pack_years + smoking_status + year, family = binomial(link = logit))
    out$calib_prev_gold3p_reg_coeffs_female <- coefficients(res)
    if (return_CI) {out$conf_prev_gold3p_reg_coeffs_female <- stats::confint(res, "year", level = 0.95)}


  terminate_session()

  return(out)
}




#' Returns results of validation tests for payoffs, costs and QALYs
#' @param nPatient number of simulated patients. Default is 1e6.
#' @param disableDiscounting if TRUE, discounting will be disabled for cost and QALY calculations. Default: TRUE
#' @param disableExacMortality if TRUE, mortality due to exacerbations will be disabled for cost and QALY calculations. Default: TRUE
#' @param jurisdiction character string specifying the jurisdiction for validation ("canada" or "us"). Default is "canada". Currently only "canada" is implemented.
#' @return validation test results
#' @export
validate_payoffs <- function(nPatient = 1e6, disableDiscounting = TRUE, disableExacMortality = TRUE, jurisdiction = "canada")
{
  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  out <- list()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- nPatient
  settings$event_stack_size <- 0
  init_session(settings = settings)
  input <- model_input$values

  if (disableDiscounting)  {
    input$global_parameters$discount_cost <- 0
    input$global_parameters$discount_qaly <- 0
  }

  if (disableExacMortality) {
    input$exacerbation$logit_p_death_by_sex <- -1000 + 0*input$exacerbation$logit_p_death_by_sex
  }

  run(input = input)
  op <- get_output()
  op_ex <- get_output_ex()

  exac_dutil<-get_inputs()$utility$exac_dutil
  exac_dcost<-get_inputs()$cost$exac_dcost


  total_qaly<-colSums(op_ex$cumul_qaly_gold_ctime)[2:5]
  qaly_loss_dueto_exac_by_gold<-rowSums(op_ex$n_exac_by_gold_severity*exac_dutil)
  back_calculated_utilities<-(total_qaly-qaly_loss_dueto_exac_by_gold)/colSums(op_ex$cumul_time_by_ctime_GOLD)[2:5]
  #I=0.81,II=0.72,III=0.68,IV=0.58)))

  out$cumul_time_per_GOLD <- colSums(op_ex$cumul_time_by_ctime_GOLD)[2:5]
  out$total_qaly <- total_qaly
  out$qaly_loss_dueto_exac_by_gold <-  qaly_loss_dueto_exac_by_gold
  out$back_calculated_utilities <- back_calculated_utilities
  out$utility_target_values <- input$utility$bg_util_by_stage
  out$utility_difference_percentage <- (out$back_calculated_utilities - out$utility_target_values[2:5]) / out$utility_target_values[2:5] * 100

  total_cost<-colSums(op_ex$cumul_cost_gold_ctime)[2:5]
  cost_dueto_exac_by_gold<-rowSums(t((exac_dcost)*t(op_ex$n_exac_by_gold_severity)))
  back_calculated_costs<-(total_cost-cost_dueto_exac_by_gold)/colSums(op_ex$cumul_time_by_ctime_GOLD)[2:5]
  #I=615, II=1831, III=2619, IV=3021

  out$total_cost <- total_cost
  out$cost_dueto_exac_by_gold <- cost_dueto_exac_by_gold
  out$back_calculated_costs <- back_calculated_costs
  out$cost_target_values <- input$cost$bg_cost_by_stage
  out$cost_difference_percentage <- (out$back_calculated_costs - out$cost_target_values[2:5]) / out$cost_target_values[2:5] * 100

  terminate_session()

  return(out)
}



#' Returns results of validation tests for mortality rate
#'
#' This function returns a table and a plot of the difference between simulated and expected
#' (life table) mortality, by sex and age.
#' @param n_sim number of simulated agents
#' @param bgd a number
#' @param bgd_h a number
#' @param manual a number
#' @param exacerbation a number
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results
#' @export
validate_mortality <- function(n_sim = 5e+05, bgd = 1, bgd_h = 1, manual = 1, exacerbation = 1, jurisdiction = "canada") {
  message("Hello from EPIC! I am going to test mortality rate and how it is affected by input parameters\n")

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- n_sim
  settings$event_stack_size <- 0
  init_session(settings = settings)

  input <- model_input$values

  input$global_parameters$time_horizon <- 1

  input$agent$p_bgd_by_sex <- input$agent$p_bgd_by_sex * bgd

  input$agent$ln_h_bgd_betas <- input$agent$ln_h_bgd_betas * bgd_h

  input$manual$explicit_mortality_by_age_sex <- input$manual$explicit_mortality_by_age_sex * manual

  input$exacerbation$logit_p_death_by_sex <- input$exacerbation$logit_p_death_by_sex * exacerbation

  message("working...\n")
  res <- run(input = input)

  message("Mortality rate was", get_output()$n_death/get_output()$cumul_time, "\n")


  if (get_output()$n_death > 0) {

    ratio<-(get_output_ex()$n_death_by_age_sex[41:111,]/get_output_ex()$sum_time_by_age_sex[41:111,])/model_input$values$agent$p_bgd_by_sex[41:111,]
    plot(40:110,ratio[,1],type='l',col='blue',xlab="age",ylab="Ratio", ylim = c(0, 4))
    legend("topright",c("male","female"),lty=c(1,1),col=c("blue","red"))
    lines(40:110,ratio[,2],type='l',col='red')
    title(cex.main=0.5,"Ratio of simulated to expected (life table) mortality, by sex and age")


    difference <- (get_output_ex()$n_death_by_age_sex[41:91, ]/get_output_ex()$sum_time_by_age_sex[41:91, ]) - model_input$values$agent$p_bgd_by_sex[41:91,
                                                                                                                                                       ]
    plot(40:90, difference[, 1], type = "l", col = "blue", xlab = "age", ylab = "Difference", ylim = c(-.1, .1))
    legend("topright", c("male", "female"), lty = c(1, 1), col = c("blue", "red"))
    lines(40:90, difference[, 2], type = "l", col = "red")
    title(cex.main = 0.5, "Difference between simulated and expected (life table) mortality, by sex and age")

    return(list(difference = difference))
  } else message("No death occured.\n")
}



# validate_comorbidity removed - MI/stroke/HF deprecated


#' Returns results of validation tests for lung function
#'
#' This function evaluates FEV1 (Forced Expiratory Volume in 1 second) values
#' and GOLD stage distributions to assess lung function in simulated patients.
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results
#' @export
validate_lung_function <- function(jurisdiction = "canada") {
  message("This function examines FEV1 values\n")

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_some_event"]
  settings$events_to_record <- events[c("event_start", "event_COPD", "event_fixed")]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- 1e+05
  settings$event_stack_size <- settings$n_base_agents * 100

  init_session(settings = settings)

  input <- model_input$values
  input$global_parameters$discount_qaly <- 0

  run(input = input)

  all_events <- as.data.frame(get_all_events_matrix())

  COPD_events <- which(all_events[, "event"] == events["event_COPD"])
  start_events <- which(all_events[, "event"] == events["event_start"])

  out_FEV1_prev <- sqldf::sqldf(paste("SELECT gold, AVG(FEV1) AS 'Mean', STDEV(FEV1) AS 'SD' FROM all_events WHERE event=", events["event_start"],
                                      " GROUP BY gold"))
  out_FEV1_inc <- sqldf::sqldf(paste("SELECT gold, AVG(FEV1) AS 'Mean', STDEV(FEV1) AS 'SD' FROM all_events WHERE event=", events["event_COPD"],
                                     " GROUP BY gold"))

  out_gold_prev <- sqldf::sqldf(paste("SELECT gold, COUNT(*) AS N FROM all_events WHERE event=", events["event_start"], " GROUP BY gold"))
  out_gold_prev[, "Percent"] <- round(out_gold_prev[, "N"]/sum(out_gold_prev[, "N"]), 3)
  out_gold_inc <- sqldf::sqldf(paste("SELECT gold, COUNT(*) AS N FROM all_events WHERE event=", events["event_COPD"], " GROUP BY gold"))
  out_gold_inc[, "Percent"] <- round(out_gold_inc[, "N"]/sum(out_gold_inc[, "N"]), 3)

  COPD_events_patients <- subset(all_events, event == 4)
  start_events_patients <- subset(all_events, event == 0 & gold > 0)

  table(COPD_events_patients[, "gold"])/sum(table(COPD_events_patients[, "gold"]))

  table(start_events_patients[, "gold"])/sum(table(start_events_patients[, "gold"]))


  out_gold_inc_patients <- table(COPD_events_patients[, "gold"])/sum(table(COPD_events_patients[, "gold"]))

  out_gold_prev_patients <- table(start_events_patients[, "gold"])/sum(table(start_events_patients[, "gold"]))

  COPD_ids <- all_events[COPD_events, "id"]

  for (i in 1:100) {
    y <- which(all_events[, "id"] == COPD_ids[i] & all_events[, "gold"] > 0)
    if (i == 1)
      plot(all_events[y, "local_time"], all_events[y, "FEV1"], type = "l", xlim = c(0, 20), ylim = c(0, 5), xlab = "local time",
           ylab = "FEV1") else lines(all_events[y, "local_time"], all_events[y, "FEV1"], type = "l")
  }
  title(cex.main = 0.5, "Trajectories of FEV1 in 100 individuals")

  return(list(FEV1_prev = out_FEV1_prev, FEV1_inc = out_FEV1_inc, gold_prev = out_gold_prev, gold_inc = out_gold_inc, gold_prev_patients = out_gold_prev_patients,
              gold_inc_patients = out_gold_inc_patients))
}



#' Returns results of validation tests for exacerbation rates
#'
#' This function runs validation tests for COPD exacerbation rates by GOLD stage
#' and compares them with reference values from studies such as CanCOLD, CIHI,
#' and Hoogendoorn. It simulates exacerbation events, and returns key metrics,
#' including overall exacerbation rates, rates by GOLD stage, and rates in
#' diagnosed vs. undiagnosed patients.
#' @param base_agents Number of agents in the simulation. Default is 1e4.
#' @param input EPIC inputs
#' @param jurisdiction character string specifying the jurisdiction for validation ("canada" or "us"). Default is "canada".
#' @return For Canada: validation test results (invisible). For US: invisible NULL (results displayed via messages and plots).
#' @export
validate_exacerbation <- function(base_agents=1e4, input=NULL, jurisdiction = "canada") {

  # Handle US jurisdiction
  if (tolower(jurisdiction) == "us") {
    settings <- get_default_settings()
    settings$record_mode <- record_mode["record_mode_event"]
    settings$n_base_agents <- base_agents

    results <- simulate(settings = settings, jurisdiction = "us", extended_results = TRUE, return_events = TRUE)
    op <- results$basic
    output_ex <- results$extended

    all_events <- results$events
    exac_events <- subset(all_events, event == 5)
    exit_events <- subset(all_events, event == 14)

    Follow_up_GOLD <- c(0, 0, 0, 0)
    last_GOLD_transition_time <- 0
    for (i in 2:dim(all_events)[1]) {
      if (all_events[i, "id"] != all_events[i - 1, "id"])
        last_GOLD_transition_time <- 0
      if ((all_events[i, "id"] == all_events[i - 1, "id"]) & (all_events[i, "gold"] != all_events[i - 1, "gold"])) {
        Follow_up_GOLD[all_events[i - 1, "gold"]] = Follow_up_GOLD[all_events[i - 1, "gold"]] + all_events[i - 1, "followup_after_COPD"] -
          last_GOLD_transition_time
        last_GOLD_transition_time <- all_events[i - 1, "followup_after_COPD"]
      }
      if (all_events[i, "event"] == 14)
        Follow_up_GOLD[all_events[i, "gold"]] = Follow_up_GOLD[all_events[i, "gold"]] + all_events[i, "followup_after_COPD"] -
          last_GOLD_transition_time
    }


    #----------------------------DIAGNOSED ------------------------------------
    #-------------------------------------------------------------------------

    all_events_diagnosed          <- subset(all_events, diagnosis > 0 & gold > 0 )
    exac_events_diagnosed         <- subset(all_events_diagnosed, event == 5 )
    sev_exac_events_diagnosed    <- subset(all_events_diagnosed, event == 5 & (exac_status == 3 | exac_status == 4) )
    mod_sev_exac_events_diagnosed <- subset(all_events_diagnosed, event == 5 & (exac_status == 3 | exac_status == 4 | exac_status == 2) )
    exit_events_diagnosed         <- subset(all_events_diagnosed, event == 14)

    Follow_up_GOLD_diagnosed <- c(0, 0, 0, 0)
    last_GOLD_transition_time_diagnosed <- 0
    for (i in 2:dim(all_events_diagnosed)[1]) {
      if ((all_events_diagnosed[i, "id"] != all_events_diagnosed[i - 1, "id"]))
        last_GOLD_transition_time_diagnosed <- 0
      if ((all_events_diagnosed[i, "id"] == all_events_diagnosed[i - 1, "id"]) & (all_events_diagnosed[i, "gold"] != all_events_diagnosed[i - 1, "gold"])) {
        Follow_up_GOLD_diagnosed[all_events_diagnosed[i - 1, "gold"]] = Follow_up_GOLD_diagnosed[all_events_diagnosed[i - 1, "gold"]] + (all_events_diagnosed[i - 1, "local_time"]-all_events_diagnosed[i - 1, "time_at_diagnosis"]) -
          last_GOLD_transition_time_diagnosed
        last_GOLD_transition_time_diagnosed <- (all_events_diagnosed[i - 1, "local_time"]-all_events_diagnosed[i - 1, "time_at_diagnosis"])
      }
      if (all_events_diagnosed[i, "event"] == 14)
        Follow_up_GOLD_diagnosed[all_events_diagnosed[i, "gold"]] = Follow_up_GOLD_diagnosed[all_events_diagnosed[i, "gold"]] + (all_events_diagnosed[i, "local_time"]-all_events_diagnosed[i, "time_at_diagnosis"]) -
          last_GOLD_transition_time_diagnosed
    }

    #----------------------------UNDIAGNOSED ------------------------------------
    #-------------------------------------------------------------------------

    all_events_undiagnosed          <- subset(all_events, diagnosis == 0 & gold > 0 & gold < 3) #CanCOLD is only GOLD 1 and 2
    exac_events_undiagnosed         <- subset(all_events_undiagnosed, event == 5 )
    sev_exac_events_undiagnosed     <- subset(all_events_undiagnosed, event == 5 & (exac_status == 3 | exac_status == 4) )
    mod_sev_exac_events_undiagnosed <- subset(all_events_undiagnosed, event == 5 & (exac_status == 3 | exac_status == 4 | exac_status == 2) )
    exit_events_undiagnosed         <- subset(all_events_undiagnosed, event == 14)

    Follow_up_GOLD_undiagnosed <- c(0, 0, 0, 0)
    last_GOLD_transition_time_undiagnosed <- 0
    for (i in 2:dim(all_events_undiagnosed)[1]) {
      if ((all_events_undiagnosed[i, "id"] != all_events_undiagnosed[i - 1, "id"]))
        last_GOLD_transition_time_undiagnosed <- 0
      if ((all_events_undiagnosed[i, "id"] == all_events_undiagnosed[i - 1, "id"]) & (all_events_undiagnosed[i, "gold"] != all_events_undiagnosed[i - 1, "gold"])) {
        Follow_up_GOLD_undiagnosed[all_events_undiagnosed[i - 1, "gold"]] = Follow_up_GOLD_undiagnosed[all_events_undiagnosed[i - 1, "gold"]] + (all_events_undiagnosed[i - 1, "local_time"]-all_events_undiagnosed[i - 1, "time_at_diagnosis"]) -
          last_GOLD_transition_time_undiagnosed
        last_GOLD_transition_time_undiagnosed <- (all_events_undiagnosed[i - 1, "local_time"]-all_events_undiagnosed[i - 1, "time_at_diagnosis"])
      }
      if (all_events_undiagnosed[i, "event"] == 14)
        Follow_up_GOLD_undiagnosed[all_events_undiagnosed[i, "gold"]] = Follow_up_GOLD_undiagnosed[all_events_undiagnosed[i, "gold"]] + (all_events_undiagnosed[i, "local_time"]-all_events_undiagnosed[i, "time_at_diagnosis"]) -
          last_GOLD_transition_time_undiagnosed
    }

    terminate_session()

    #----------------------------All ------------------------------------
    #-------------------------------------------------------------------------

    message("Exacerbation Rates per GOLD stages for all patients:")

    GOLD_I   <- (as.data.frame(table(exac_events[, "gold"]))[1, 2]/Follow_up_GOLD[1])
    GOLD_II  <- (as.data.frame(table(exac_events[, "gold"]))[2, 2]/Follow_up_GOLD[2])
    GOLD_III <- (as.data.frame(table(exac_events[, "gold"]))[3, 2]/Follow_up_GOLD[3])
    GOLD_IV  <- (as.data.frame(table(exac_events[, "gold"]))[4, 2]/Follow_up_GOLD[4])

    message(paste0("exacRateGOLDI   = ", round(GOLD_I  , 2)))
    message(paste0("exacRateGOLDII  = ", round(GOLD_II , 2)))
    message(paste0("exacRateGOLDIII = ", round(GOLD_III, 2)))
    message(paste0("exacRateGOLDIV  = ", round(GOLD_IV , 2)))


    #----------------------------All ------------------------------------
    #-------------------------------------------------------------------------
    total_rate <- round(nrow(exac_events)/sum(Follow_up_GOLD), 2)
    Exac_per_GOLD <- matrix (NA, nrow = 3, ncol =3)
    colnames(Exac_per_GOLD) <- c("GOLD", "EPIC", "CanCOLD")
    # CanCOLD only available for GOLD 1 and 2. See doi: 10.1164/rccm.201509-1795OC

    Follow_up_GOLD_all_2level <- c(Follow_up_GOLD[1], Follow_up_GOLD[2]) # Because CanCOLD is mostly GOLD2, here we compare EPIC's GOLD2 only instead of GOLD2+
    #  Follow_up_GOLD_all_2level <- c(Follow_up_GOLD[1], sum(Follow_up_GOLD[2:4]))
    GOLD_counts_all       <- as.data.frame(table(exac_events[, "gold"]))[, 2]
    GOLD_counts_all       <- c(GOLD_counts_all[1], sum(GOLD_counts_all[2:4]))

    Exac_per_GOLD[1:3, 1] <- c("total", "gold1", "gold2+")
    Exac_per_GOLD[1:3, 2] <- c(total_rate,
                               round(x=GOLD_counts_all/Follow_up_GOLD_all_2level,
                                     digits = 2))
    Exac_per_GOLD[1:3, 3] <- c(0.39, 0.28, 0.53)

    df <- as.data.frame(Exac_per_GOLD)
    dfm <- melt(df[,c("GOLD", "EPIC", "CanCOLD")],id.vars = 1)
    plot <-
      ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
      scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
      theme_tufte(base_size=14, ticks=FALSE)  +
      geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
      ylab ("Rate") +
      labs(caption = "Total rate of exacerbations per year for all patients")

    print(plot)
    message("Total rate of exacerbation in all patients (0.39 per year in CanCOLD): ", total_rate)

    #--------------------------- total number of severe exacerbations:

    message("Is the rate of severe and very severe exacerbations around 510 per Ford et al. 2015?")
    n_exac <- data.frame(year= 1:20,Severe_Exacerbations = (output_ex$n_exac_by_ctime_severity[,3]+output_ex$n_exac_by_ctime_severity[,4])* (100000/rowSums(output_ex$n_alive_by_ctime_sex)))
    avgRate_sevExac <- mean(n_exac$Severe_Exacerbations[round(nrow(n_exac)/2,0):nrow(n_exac)])
    avgRate_sevExac <- round(avgRate_sevExac, 2)
    message(paste0("Average rate during 20 years: ", avgRate_sevExac))

    rate2017_sevExac <-(output_ex$n_exac_by_ctime_severity[3,3]+output_ex$n_exac_by_ctime_severity[3,4])*(100000/sum(output_ex$n_alive_by_ctime_sex[3,]))
    rate2017_sevExac <- round(rate2017_sevExac, 2)
    message(paste0("Rate in 2017: ", rate2017_sevExac))


    #----------------------------Diagnosed ------------------------------------
    #-------------------------------------------------------------------------

    Exac_per_GOLD_diagnosed <- matrix (NA, nrow = 4, ncol = 3)
    colnames(Exac_per_GOLD_diagnosed) <- c("GOLD", "EPIC", "Hoogendoorn")
    # ACCEPT data is rates from a join of ECLIPSE, MACRO, OPTIMAL and STATCOPE.
    Exac_per_GOLD_diagnosed[1:4, 1] <- c("gold1", "gold2", "gold3", "gold4")
    Exac_per_GOLD_diagnosed[1:4, 2] <- round(
      x=as.data.frame(table(exac_events_diagnosed[, "gold"]))[, 2]/
        Follow_up_GOLD_diagnosed, digits = 2)
    Exac_per_GOLD_diagnosed[1:4, 3] <- c(0.82, 1.17, 1.61, 2.10)

    df <- as.data.frame(Exac_per_GOLD_diagnosed)
    dfm <- melt(df[,c("GOLD", "EPIC", "Hoogendoorn")],id.vars = 1)
    plot <- ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
      scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
      theme_tufte(base_size=14, ticks=FALSE)  +
      geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
      ylab ("Rate") +
      labs(caption = "Total rate of exacerbations per year for diagnosed patients")
    print(plot)

    message("Total rate of exacerbation in diagnosed patients (1.5 per year in Hoogendoorn): ", round(nrow(exac_events_diagnosed)/sum(Follow_up_GOLD_diagnosed), 2))


    #----------------------------Diagnosed Moderate and Severe------------------------------------
    #-------------------------------------------------------------------------

    # Updated October 14, 2025
    Exac_per_GOLD_diagnosed <- matrix (NA, nrow = 4, ncol = 3)
    colnames(Exac_per_GOLD_diagnosed) <- c("GOLD", "EPIC", "Wallace 2019")

    Exac_per_GOLD_diagnosed[1:4, 1] <- c("gold1", "gold2", "gold3", "gold4")
    Exac_per_GOLD_diagnosed[1:4, 2] <- round(
      x=as.data.frame(table(mod_sev_exac_events_diagnosed[, "gold"]))[, 2]/
        Follow_up_GOLD_diagnosed, digits = 2)
    Exac_per_GOLD_diagnosed[1:4, 3] <- c(0.404, 0.489, 0.836, 0.891)

    df <- as.data.frame(Exac_per_GOLD_diagnosed)
    dfm <- melt(df[,c("GOLD", "EPIC", "Wallace 2019")],id.vars = 1)
    plot <-
      ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
      scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
      theme_tufte(base_size=14, ticks=FALSE)  +
      geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
      ylab ("Rate") +
      labs(caption = "Total rate of moderate/severe exacerbations per year for diagnosed patients")

    print(plot)


    #----------------------------Diagnosed Severe------------------------------------
    #-------------------------------------------------------------------------

    Exac_per_GOLD_diagnosed <- matrix (NA, nrow = 4, ncol = 3)
    colnames(Exac_per_GOLD_diagnosed) <- c("GOLD", "EPIC", "Wallace 2019")

    Exac_per_GOLD_diagnosed[1:4, 1] <- c("gold1", "gold2", "gold3", "gold4")
    Exac_per_GOLD_diagnosed[1:4, 2] <- round(
      x=as.data.frame(table(sev_exac_events_diagnosed[, "gold"]))[, 2]/
        Follow_up_GOLD_diagnosed, digits = 2)
    Exac_per_GOLD_diagnosed[1:4, 3] <- c(0.12, 0.139, 0.254, 0.422)

    df <- as.data.frame(Exac_per_GOLD_diagnosed)
    dfm <- melt(df[,c("GOLD", "EPIC", "Wallace 2019")],id.vars = 1)
    plot <-
      ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
      scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
      theme_tufte(base_size=14, ticks=FALSE)  +
      geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
      ylab ("Rate") +
      labs(caption = "Total rate of severe exacerbations per year for diagnosed patients")

    print(plot)

    #----------------------------Undiagnosed ------------------------------------
    #----------------------------------------------------------------------------
    total_rate_undiagnosed <- round(nrow(exac_events_undiagnosed)/sum(Follow_up_GOLD_undiagnosed), 2)
    Exac_per_GOLD_undiagnosed <- matrix (NA, nrow = 3, ncol = 3)
    colnames(Exac_per_GOLD_undiagnosed) <- c("GOLD", "EPIC", "CanCOLD")
    Exac_per_GOLD_undiagnosed[1:3, 1] <- c("total", "gold1", "gold2+")

    Follow_up_GOLD_undiagnosed_2level <- c(Follow_up_GOLD_undiagnosed[1],
                                           Follow_up_GOLD_undiagnosed[2]) #Because CanCOLD is mostly GOLD2, we compare to GOLD2 EPIC
    #Follow_up_GOLD_undiagnosed_2level <- c(Follow_up_GOLD_undiagnosed[1], sum(Follow_up_GOLD_undiagnosed[2:4]))
    GOLD_counts_undiagnosed   <- as.data.frame(table(exac_events_undiagnosed[, "gold"]))[, 2]
    GOLD_counts_undiagnosed   <- c(GOLD_counts_undiagnosed[1],
                                   GOLD_counts_undiagnosed[2])


    Exac_per_GOLD_undiagnosed[1:3, 2] <- c(total_rate_undiagnosed,
                                           round(x=GOLD_counts_undiagnosed/Follow_up_GOLD_undiagnosed_2level, digits = 2))
    Exac_per_GOLD_undiagnosed[1:3, 3] <- c(0.30, 0.24, 0.40)

    df <- as.data.frame(Exac_per_GOLD_undiagnosed)
    dfm <- melt(df[,c("GOLD", "EPIC", "CanCOLD")],id.vars = 1)
    plot <-
      ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
      scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
      theme_tufte(base_size=14, ticks=FALSE)  +
      geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
      ylab ("Rate") +
      labs(caption = "Total rate of exacerbations per year for undiagnosed patients")
    print(plot)

    message("Total rate of exacerbation in undiagnosed patients (0.30 per year in CanCOLD): ",
            total_rate_undiagnosed)

    return(invisible(NULL))
  }

  # Check for unsupported jurisdictions
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' and 'us' are supported.")
  }

  # Canada implementation (existing code)
  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_event"]
  #settings$agent_stack_size <- 0
  settings$n_base_agents <- base_agents
  #settings$event_stack_size <- 1
  init_session(settings = settings)
  if (is.null(input)) {input <- model_input$values}

  run(input = input)
  op <- get_output()
  output_ex <- get_output_ex()


  all_events <- as.data.frame(get_all_events_matrix())
  exac_events <- subset(all_events, event == 5)
  exit_events <- subset(all_events, event == 14)

  Follow_up_GOLD <- c(0, 0, 0, 0)
  last_GOLD_transition_time <- 0
  for (i in 2:dim(all_events)[1]) {
    if (all_events[i, "id"] != all_events[i - 1, "id"])
      last_GOLD_transition_time <- 0
    if ((all_events[i, "id"] == all_events[i - 1, "id"]) & (all_events[i, "gold"] != all_events[i - 1, "gold"])) {
      Follow_up_GOLD[all_events[i - 1, "gold"]] = Follow_up_GOLD[all_events[i - 1, "gold"]] + all_events[i, "followup_after_COPD"] -
        last_GOLD_transition_time
      last_GOLD_transition_time <- all_events[i, "followup_after_COPD"]
    }
    if (all_events[i, "event"] == 14)
      Follow_up_GOLD[all_events[i, "gold"]] = Follow_up_GOLD[all_events[i, "gold"]] + all_events[i, "followup_after_COPD"] -
        last_GOLD_transition_time
  }


  #----------------------------DIAGNOSED ------------------------------------
  #-------------------------------------------------------------------------

  all_events_diagnosed          <- subset(all_events, diagnosis > 0 & gold > 0 )
  exac_events_diagnosed         <- subset(all_events_diagnosed, event == 5 )
  sev_exac_events_diagnosed    <- subset(all_events_diagnosed, event == 5 & (exac_status == 3 | exac_status == 4) )
  mod_sev_exac_events_diagnosed <- subset(all_events_diagnosed, event == 5 & (exac_status == 3 | exac_status == 4 | exac_status == 2) )
  exit_events_diagnosed         <- subset(all_events_diagnosed, event == 14)

  Follow_up_GOLD_diagnosed <- c(0, 0, 0, 0)
  last_GOLD_transition_time_diagnosed <- 0
  for (i in 2:dim(all_events_diagnosed)[1]) {
    if ((all_events_diagnosed[i, "id"] != all_events_diagnosed[i - 1, "id"]))
      last_GOLD_transition_time_diagnosed <- 0
    if ((all_events_diagnosed[i, "id"] == all_events_diagnosed[i - 1, "id"]) & (all_events_diagnosed[i, "gold"] != all_events_diagnosed[i - 1, "gold"])) {
      Follow_up_GOLD_diagnosed[all_events_diagnosed[i - 1, "gold"]] = Follow_up_GOLD_diagnosed[all_events_diagnosed[i - 1, "gold"]] + (all_events_diagnosed[i, "local_time"]-all_events_diagnosed[i, "time_at_diagnosis"]) -
        last_GOLD_transition_time_diagnosed
      last_GOLD_transition_time_diagnosed <- (all_events_diagnosed[i, "local_time"]-all_events_diagnosed[i, "time_at_diagnosis"])
    }
    if (all_events_diagnosed[i, "event"] == 14)
      Follow_up_GOLD_diagnosed[all_events_diagnosed[i, "gold"]] = Follow_up_GOLD_diagnosed[all_events_diagnosed[i, "gold"]] + (all_events_diagnosed[i, "local_time"]-all_events_diagnosed[i, "time_at_diagnosis"]) -
        last_GOLD_transition_time_diagnosed
  }

  #----------------------------UNDIAGNOSED ------------------------------------
  #-------------------------------------------------------------------------

  all_events_undiagnosed          <- subset(all_events, diagnosis == 0 & gold > 0 & gold < 3) #CanCOLD is only GOLD 1 and 2
  exac_events_undiagnosed         <- subset(all_events_undiagnosed, event == 5 )
  sev_exac_events_undiagnosed     <- subset(all_events_undiagnosed, event == 5 & (exac_status == 3 | exac_status == 4) )
  mod_sev_exac_events_undiagnosed <- subset(all_events_undiagnosed, event == 5 & (exac_status == 3 | exac_status == 4 | exac_status == 2) )
  exit_events_undiagnosed         <- subset(all_events_undiagnosed, event == 14)

  Follow_up_GOLD_undiagnosed <- c(0, 0, 0, 0)
  last_GOLD_transition_time_undiagnosed <- 0
  for (i in 2:dim(all_events_undiagnosed)[1]) {
    if ((all_events_undiagnosed[i, "id"] != all_events_undiagnosed[i - 1, "id"]))
      last_GOLD_transition_time_undiagnosed <- 0
    if ((all_events_undiagnosed[i, "id"] == all_events_undiagnosed[i - 1, "id"]) & (all_events_undiagnosed[i, "gold"] != all_events_undiagnosed[i - 1, "gold"])) {
      Follow_up_GOLD_undiagnosed[all_events_undiagnosed[i - 1, "gold"]] = Follow_up_GOLD_undiagnosed[all_events_undiagnosed[i - 1, "gold"]] + (all_events_undiagnosed[i, "local_time"]-all_events_undiagnosed[i, "time_at_diagnosis"]) -
        last_GOLD_transition_time_undiagnosed
      last_GOLD_transition_time_undiagnosed <- (all_events_undiagnosed[i, "local_time"]-all_events_undiagnosed[i, "time_at_diagnosis"])
    }
    if (all_events_undiagnosed[i, "event"] == 14)
      Follow_up_GOLD_undiagnosed[all_events_undiagnosed[i, "gold"]] = Follow_up_GOLD_undiagnosed[all_events_undiagnosed[i, "gold"]] + (all_events_undiagnosed[i, "local_time"]-all_events_undiagnosed[i, "time_at_diagnosis"]) -
        last_GOLD_transition_time_undiagnosed
  }

  terminate_session()

  #----------------------------All ------------------------------------
  #-------------------------------------------------------------------------

  message("Exacerbation Rates per GOLD stages for all patients:")

  # Count exacerbations by GOLD stage - use named vector to avoid indexing errors
  exac_counts_by_gold <- table(exac_events[, "gold"])

  GOLD_I   <- ifelse(Follow_up_GOLD[1] > 0,
                     as.numeric(exac_counts_by_gold["1"]) / Follow_up_GOLD[1],
                     0)
  GOLD_II  <- ifelse(Follow_up_GOLD[2] > 0,
                     as.numeric(exac_counts_by_gold["2"]) / Follow_up_GOLD[2],
                     0)
  GOLD_III <- ifelse(Follow_up_GOLD[3] > 0,
                     as.numeric(exac_counts_by_gold["3"]) / Follow_up_GOLD[3],
                     0)
  GOLD_IV  <- ifelse(Follow_up_GOLD[4] > 0,
                     as.numeric(exac_counts_by_gold["4"]) / Follow_up_GOLD[4],
                     0)

  # Replace NA with 0 (happens when there are no exacerbations for that GOLD stage)
  GOLD_I   <- ifelse(is.na(GOLD_I), 0, GOLD_I)
  GOLD_II  <- ifelse(is.na(GOLD_II), 0, GOLD_II)
  GOLD_III <- ifelse(is.na(GOLD_III), 0, GOLD_III)
  GOLD_IV  <- ifelse(is.na(GOLD_IV), 0, GOLD_IV)

  message(paste0("exacRateGOLDI   = ", round(GOLD_I  , 2)))
  message(paste0("exacRateGOLDII  = ", round(GOLD_II , 2)))
  message(paste0("exacRateGOLDIII = ", round(GOLD_III, 2)))
  message(paste0("exacRateGOLDIV  = ", round(GOLD_IV , 2)))


  #----------------------------All ------------------------------------
  #-------------------------------------------------------------------------
  total_rate <- round(nrow(exac_events)/sum(Follow_up_GOLD), 2)
  Exac_per_GOLD <- matrix (NA, nrow = 3, ncol =3)
  colnames(Exac_per_GOLD) <- c("GOLD", "EPIC", "CanCOLD")
  # CanCOLD only available for GOLD 1 and 2. See doi: 10.1164/rccm.201509-1795OC

  Follow_up_GOLD_all_2level <- c(Follow_up_GOLD[1], Follow_up_GOLD[2]) # Because CanCOLD is mostly GOLD2, here we compare EPIC's GOLD2 only instead of GOLD2+
#  Follow_up_GOLD_all_2level <- c(Follow_up_GOLD[1], sum(Follow_up_GOLD[2:4]))

  # Use named access to avoid indexing errors
  exac_table <- table(exac_events[, "gold"])
  GOLD_counts_all <- c(
    as.numeric(exac_table["1"]),
    sum(as.numeric(exac_table[c("2", "3", "4")]), na.rm = TRUE)
  )
  # Replace NA with 0
  GOLD_counts_all[is.na(GOLD_counts_all)] <- 0

  Exac_per_GOLD[1:3, 1] <- c("total", "gold1", "gold2+")
  Exac_per_GOLD[1:3, 2] <- c(total_rate,
                             round(x=GOLD_counts_all/Follow_up_GOLD_all_2level,
                                   digits = 2))
  Exac_per_GOLD[1:3, 3] <- c(0.39, 0.28, 0.53)

  df <- as.data.frame(Exac_per_GOLD)
  dfm <- melt(df[,c("GOLD", "EPIC", "CanCOLD")],id.vars = 1)
  plot <-
    ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
    scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
    theme_tufte(base_size=14, ticks=FALSE)  +
    geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
    ylab ("Rate") +
    labs(caption = "Total rate of exacerbations per year for all patients")

  print(plot)
  message("Total rate of exacerbation in all patients (0.39 per year in CanCOLD): ", total_rate)

  #--------------------------- total number of severe exacerbations:

  message("Is the rate of severe and very severe exacerbations around 477 per CIHI?")
  n_exac <- data.frame(year= 1:20,Severe_Exacerbations = (output_ex$n_exac_by_ctime_severity[,3]+output_ex$n_exac_by_ctime_severity[,4])* (100000/rowSums(output_ex$n_alive_by_ctime_sex)))
  avgRate_sevExac <- mean(n_exac$Severe_Exacerbations[round(nrow(n_exac)/2,0):nrow(n_exac)])
  avgRate_sevExac <- round(avgRate_sevExac, 2)
  message(paste0("Average rate during 20 years: ", avgRate_sevExac))

  rate2017_sevExac <-(output_ex$n_exac_by_ctime_severity[3,3]+output_ex$n_exac_by_ctime_severity[3,4])*(100000/sum(output_ex$n_alive_by_ctime_sex[3,]))
  rate2017_sevExac <- round(rate2017_sevExac, 2)
  message(paste0("Rate in 2017: ", rate2017_sevExac))


  #----------------------------Diagnosed ------------------------------------
  #-------------------------------------------------------------------------

  Exac_per_GOLD_diagnosed <- matrix (NA, nrow = 4, ncol = 3)
  colnames(Exac_per_GOLD_diagnosed) <- c("GOLD", "EPIC", "Hoogendoorn")
  # ACCEPT data is rates from a join of ECLIPSE, MACRO, OPTIMAL and STATCOPE.
  Exac_per_GOLD_diagnosed[1:4, 1] <- c("gold1", "gold2", "gold3", "gold4")

  # Use named access to avoid indexing errors
  exac_diag_table <- table(exac_events_diagnosed[, "gold"])
  exac_diag_rates <- sapply(1:4, function(i) {
    count <- as.numeric(exac_diag_table[as.character(i)])
    if (is.na(count)) count <- 0
    if (Follow_up_GOLD_diagnosed[i] > 0) {
      return(count / Follow_up_GOLD_diagnosed[i])
    } else {
      return(0)
    }
  })
  Exac_per_GOLD_diagnosed[1:4, 2] <- round(exac_diag_rates, digits = 2)
  Exac_per_GOLD_diagnosed[1:4, 3] <- c(0.82, 1.17, 1.61, 2.10)

  df <- as.data.frame(Exac_per_GOLD_diagnosed)
  dfm <- melt(df[,c("GOLD", "EPIC", "Hoogendoorn")],id.vars = 1)
  plot <- ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
    scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
    theme_tufte(base_size=14, ticks=FALSE)  +
    geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
    ylab ("Rate") +
    labs(caption = "Total rate of exacerbations per year for diagnosed patients")
  print(plot)

  message("Total rate of exacerbation in diagnosed patients (1.5 per year in Hoogendoorn): ", round(nrow(exac_events_diagnosed)/sum(Follow_up_GOLD_diagnosed), 2))


  #----------------------------Diagnosed Moderate and Severe------------------------------------
  #-------------------------------------------------------------------------

  Exac_per_GOLD_diagnosed <- matrix (NA, nrow = 4, ncol = 3)
  colnames(Exac_per_GOLD_diagnosed) <- c("GOLD", "EPIC", "ACCEPT")
  # ACCEPT data is rates from a join of ECLIPSE, MACRO, OPTIMAL and STATCOPE.
  Exac_per_GOLD_diagnosed[1:4, 1] <- c("gold1", "gold2", "gold3", "gold4")

  # Use named access to avoid indexing errors
  mod_sev_diag_table <- table(mod_sev_exac_events_diagnosed[, "gold"])
  mod_sev_diag_rates <- sapply(1:4, function(i) {
    count <- as.numeric(mod_sev_diag_table[as.character(i)])
    if (is.na(count)) count <- 0
    if (Follow_up_GOLD_diagnosed[i] > 0) {
      return(count / Follow_up_GOLD_diagnosed[i])
    } else {
      return(0)
    }
  })
  Exac_per_GOLD_diagnosed[1:4, 2] <- round(mod_sev_diag_rates, digits = 2)
  Exac_per_GOLD_diagnosed[1:4, 3] <- c(0.58, 0.91, 1.41, 1.69)

  df <- as.data.frame(Exac_per_GOLD_diagnosed)
  dfm <- melt(df[,c("GOLD", "EPIC", "ACCEPT")],id.vars = 1)
  plot <-
    ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
    scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
    theme_tufte(base_size=14, ticks=FALSE)  +
    geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
    ylab ("Rate") +
    labs(caption = "Total rate of moderate/severe exacerbations per year for diagnosed patients")

  print(plot)


  #----------------------------Diagnosed Severe------------------------------------
  #-------------------------------------------------------------------------

  Exac_per_GOLD_diagnosed <- matrix (NA, nrow = 4, ncol = 4)
  colnames(Exac_per_GOLD_diagnosed) <- c("GOLD", "EPIC", "Hoogendoorn", "ACCEPT")
  # ACCEPT data is rates from a join of ECLIPSE, MACRO, OPTIMAL and STATCOPE.
  Exac_per_GOLD_diagnosed[1:4, 1] <- c("gold1", "gold2", "gold3", "gold4")

  # Use named access to avoid indexing errors
  sev_diag_table <- table(sev_exac_events_diagnosed[, "gold"])
  sev_diag_rates <- sapply(1:4, function(i) {
    count <- as.numeric(sev_diag_table[as.character(i)])
    if (is.na(count)) count <- 0
    if (Follow_up_GOLD_diagnosed[i] > 0) {
      return(count / Follow_up_GOLD_diagnosed[i])
    } else {
      return(0)
    }
  })
  Exac_per_GOLD_diagnosed[1:4, 2] <- round(sev_diag_rates, digits = 2)

  Exac_per_GOLD_diagnosed[1:4, 3] <- c(0.11, 0.16, 0.22, 0.28)
  Exac_per_GOLD_diagnosed[1:4, 4] <- c(0.10, 0.14, 0.32, 0.42)

  df <- as.data.frame(Exac_per_GOLD_diagnosed)
  dfm <- melt(df[,c("GOLD", "EPIC", "Hoogendoorn", "ACCEPT")],id.vars = 1)
  plot <-
    ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
    scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
    theme_tufte(base_size=14, ticks=FALSE)  +
    geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
    ylab ("Rate") +
    labs(caption = "Total rate of severe exacerbations per year for diagnosed patients")

  print(plot)

  #----------------------------Undiagnosed ------------------------------------
  #----------------------------------------------------------------------------
  total_rate_undiagnosed <- round(nrow(exac_events_undiagnosed)/sum(Follow_up_GOLD_undiagnosed), 2)
  Exac_per_GOLD_undiagnosed <- matrix (NA, nrow = 3, ncol = 3)
  colnames(Exac_per_GOLD_undiagnosed) <- c("GOLD", "EPIC", "CanCOLD")
  Exac_per_GOLD_undiagnosed[1:3, 1] <- c("total", "gold1", "gold2+")

  Follow_up_GOLD_undiagnosed_2level <- c(Follow_up_GOLD_undiagnosed[1],
                                         Follow_up_GOLD_undiagnosed[2]) #Because CANCold is mostly GOLD2, we comprare to GOLD2 EPIC
  #Follow_up_GOLD_undiagnosed_2level <- c(Follow_up_GOLD_undiagnosed[1], sum(Follow_up_GOLD_undiagnosed[2:4]))

  # Use named access to avoid indexing errors
  exac_undiag_table <- table(exac_events_undiagnosed[, "gold"])
  GOLD_counts_undiagnosed <- c(
    as.numeric(exac_undiag_table["1"]),
    as.numeric(exac_undiag_table["2"])
  )
  # Replace NA with 0
  GOLD_counts_undiagnosed[is.na(GOLD_counts_undiagnosed)] <- 0


  Exac_per_GOLD_undiagnosed[1:3, 2] <- c(total_rate_undiagnosed,
    round(x=GOLD_counts_undiagnosed/Follow_up_GOLD_undiagnosed_2level, digits = 2))
  Exac_per_GOLD_undiagnosed[1:3, 3] <- c(0.30, 0.24, 0.40)

  df <- as.data.frame(Exac_per_GOLD_undiagnosed)
  dfm <- melt(df[,c("GOLD", "EPIC", "CanCOLD")],id.vars = 1)
  plot <-
   ggplot(dfm, aes(x = GOLD, y = as.numeric(value))) +
    scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
    theme_tufte(base_size=14, ticks=FALSE)  +
    geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
    ylab ("Rate") +
    labs(caption = "Total rate of exacerbations per year for undiagnosed patients")
  print(plot)

  message("Total rate of exacerbation in undiagnosed patients (0.30 per year in CanCOLD): ",
          total_rate_undiagnosed)

  #----------------------------By Sex ------------------------------------
  #-----------------------------------------------------------------------

  message("\n")
  message("Exacerbation rates (per COPD patient-year) by sex over time:\n")

  # Extract n_exac_by_ctime_sex data (number of exacerbations)
  # This is a matrix with dimensions [time_horizon][2] where columns are sex (0=male, 1=female)
  n_exac <- as.data.frame(output_ex$n_exac_by_ctime_sex)

  # Extract n_COPD_by_ctime_sex data (COPD patient-years at risk)
  n_COPD <- as.data.frame(output_ex$n_COPD_by_ctime_sex)

  # Calculate rate per COPD patient-year
  exac_rate <- n_exac / n_COPD
  colnames(exac_rate) <- c("Male", "Female")
  exac_rate$Year <- 1:nrow(exac_rate)

  # Reshape data for ggplot
  exac_rate_long <- reshape2::melt(exac_rate, id.vars = "Year",
                                   variable.name = "Sex",
                                   value.name = "Exacerbation_Rate")

  # Create the plot
  plot <- ggplot2::ggplot(exac_rate_long, ggplot2::aes(x = .data$Year, y = .data$Exacerbation_Rate, color = .data$Sex)) +
    ggplot2::geom_line(linewidth = 1) +
    ggplot2::geom_point(size = 2) +
    theme_bw(base_size = 14) +
    labs(title = "Exacerbation Rate by Sex Over Time",
         x = "Year",
         y = "Exacerbation Rate (per COPD patient-year)",
         color = "Sex") +
    theme(legend.position = "bottom",
          plot.title = element_text(hjust = 0.5))

  print(plot)

  # Print summary statistics
  message("\nSummary statistics:")
  message("Total exacerbations (Male): ", sum(n_exac[, 1]))
  message("Total exacerbations (Female): ", sum(n_exac[, 2]))
  message("Total COPD patient-years (Male): ", round(sum(n_COPD[, 1]), 2))
  message("Total COPD patient-years (Female): ", round(sum(n_COPD[, 2]), 2))
  message("Mean exacerbation rate per COPD patient-year (Male): ", round(mean(exac_rate$Male, na.rm = TRUE), 4))
  message("Mean exacerbation rate per COPD patient-year (Female): ", round(mean(exac_rate$Female, na.rm = TRUE), 4))
  message("Overall exacerbation rate per COPD patient-year (Male): ", round(sum(n_exac[, 1]) / sum(n_COPD[, 1]), 4))
  message("Overall exacerbation rate per COPD patient-year (Female): ", round(sum(n_exac[, 2]) / sum(n_COPD[, 2]), 4))

}


#' Returns the Kaplan Meier curve comparing COPD and non-COPD
#' @param savePlots TRUE or FALSE (default), exports 300 DPI population growth and pyramid plots comparing simulated vs. predicted population
#' @param base_agents Number of agents in the simulation. Default is 1e4.
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results
#' @export
validate_survival <- function(savePlots = FALSE, base_agents=1e4, jurisdiction = "canada") {

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  if (!requireNamespace("survival", quietly = TRUE)) {
    stop("Package \"survival\" needed for this function to work. Please install it.",
         call. = FALSE)
  }

  if (!requireNamespace("survminer", quietly = TRUE)) {
    stop("Package \"survminer\" needed for this function to work. Please install it.",
         call. = FALSE)
  }


  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_event"]
  #settings$agent_stack_size <- 0
  settings$n_base_agents <- base_agents
  #settings$event_stack_size <- 1
  init_session(settings = settings)
  input <- model_input$values  #We can work with local copy more conveniently and submit it to the Run function

  run(input = input)
  events <- as.data.frame(get_all_events_matrix())
  terminate_session()

  cohort <- subset(events, ((event==7) | (event==13) | (event==14)))

  cohort <- cohort %>% filter((id==lead(id) | ((event == 14) & id!=lag(id))))

  cohort$copd <- (cohort$gold>0)
  cohort$death <- (cohort$event!=14)
  cohort$age <- (cohort$age_at_creation+cohort$local_time)

  #fit <- survfit(Surv(age, death) ~ copd, data=cohort)
  fit <- survival::survfit(Surv(age, death) ~ copd, data=cohort)

  # Customized survival curves
  surv_plot <- survminer::ggsurvplot(fit, data = cohort, censor.shape="", censor.size = 1,
                          surv.median.line = "hv", # Add medians survival

                          # Change legends: title & labels
                          legend.title = "Disease Status",
                          legend.labs = c("Non-COPD", "COPD"),
                          # Add p-value and tervals
                          pval = TRUE,

                          conf.int = TRUE,
                          xlim = c(40,110),         # present narrower X axis, but not affect
                          # survival estimates.
                          xlab = "Age",   # customize X axis label.
                          break.time.by = 20,     # break X axis in time intervals by 500.
                          # Add risk table
                          #risk.table = TRUE,
                          tables.height = 0.2,
                          tables.theme = theme_cleantable(),

                          # Color palettes. Use custom color: c("#E7B800", "#2E9FDF"),
                          # or brewer color (e.g.: "Dark2"), or ggsci color (e.g.: "jco")
                          #palette = c("gray0", "gray1"),
                          ggtheme = theme_tufte() +
                            theme(axis.line = element_line(colour = "black"),
                                  panel.grid.major = element_blank(),
                                  panel.grid.minor = element_blank(),
                                  panel.border = element_blank(),
                                  panel.background = element_blank())  # Change ggplot2 theme
  )

  plot (surv_plot)

  if (savePlots) ggsave(file.path(tempdir(), paste0("survival-diagnosed", ".tiff")), plot = plot(surv_plot), device = "tiff", dpi = 300)

  fitcox <- coxph(Surv(age, death) ~ copd, data = cohort)
  ftest <- cox.zph(fitcox)
  message(paste(capture.output(summary(fitcox)), collapse = "\n"))

  return(surv_plot)
}


#' Returns results of validation tests for diagnosis
#'
#' This function returns a table showing the proportion of COPD patients diagnosed
#' over the model's runtime. It also provides a second table that breaks down the proportion
#' of diagnosed patients by COPD severity. Additionally, the function generates a plot
#' to visualize the distribution of diagnoses over time.
#'
#' @param n_sim number of agents
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results
#' @export
validate_diagnosis <- function(n_sim = 1e+04, jurisdiction = "canada") {
  message("Let's take a look at diagnosis\n")

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- n_sim
  settings$event_stack_size <- 0
  init_session(settings = settings)

  input <- model_input$values

  res <- run(input = input)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs <- get_inputs()
  output_ex <- get_output_ex()

  message("Here are the proportion of COPD patients diagnosed over model time: \n")

  diag <- data.frame(Year=1:inputs$global_parameters$time_horizon,
                     COPD=rowSums(output_ex$n_COPD_by_ctime_sex),
                     Diagnosed=rowSums(output_ex$n_Diagnosed_by_ctime_sex))

  diag$Proportion <- round(diag$Diagnosed/diag$COPD,2)

  message(paste(capture.output(diag), collapse = "\n"))

  message("The average proportion diagnosed from year", round(length(diag$Proportion)/2,0), "to", length(diag$Proportion), "is",
      mean(diag$Proportion[(round(length(diag$Proportion)/2,0)):(length(diag$Proportion))]),"\n")

  diag.plot <- tidyr::gather(data=diag, key="Variable", value="Number", c(COPD,Diagnosed))

  diag.plotted <- ggplot2::ggplot(diag.plot, aes(x=Year, y=Number, col=Variable)) +
                  geom_line() + geom_point() + expand_limits(y = 0) +
                  theme_bw() + ylab("Number of COPD patients") + xlab("Years")

  plot(diag.plotted)

  message("\n")
  message("Now let's look at the proportion diagnosed by COPD severity.\n")

  prop <- data.frame(Year=1:inputs$global_parameters$time_horizon,
                     output_ex$n_Diagnosed_by_ctime_severity/output_ex$n_COPD_by_ctime_severity)[,c(1,3,4,5,6)]

  names(prop) <- c("Year","GOLD1","GOLD2","GOLD3","GOLD4")
  prop <- prop[-1,]
  message(paste(capture.output(prop), collapse = "\n"))

  message("The average proportion of GOLD 1 and 2 that are diagnosed from year", round(nrow(prop)/2,0), "to", max(prop$Year), "is",
      (mean(prop$GOLD1[round((nrow(prop)/2),0):nrow(prop)]) + mean(prop$GOLD2[round((nrow(prop)/2),0):nrow(prop)]))/2,"\n")

  prop.plot <- tidyr::gather(data=prop, key="GOLD", value="Proportion", c(GOLD1:GOLD4))

  prop.plotted <- ggplot2::ggplot(prop.plot, aes(x=Year, y=Proportion, col=GOLD)) +
                    geom_line() + geom_point() + expand_limits(y = 0) +
                    theme_bw() + ylab("Proportion diagnosed") + xlab("Years")

  plot(prop.plotted)

  terminate_session()
}

#' Returns results of validation tests for GP visits
#'
#' This function returns Average number of GP visits by sex, COPD severity and
#' COPD diagnosis status along with their plots.
#' @param n_sim number of agents
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results
#' @export
validate_gpvisits <- function(n_sim = 1e+04, jurisdiction = "canada") {
  message("Let's take a look at GP visits\n")

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- n_sim
  settings$event_stack_size <- 0
  init_session(settings = settings)

  input <- model_input$values

  res <- run(input = input)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs <- get_inputs()
  output_ex <- get_output_ex()

  message("\n")
  message("Here is the Average number of GP visits by sex:\n")

  GPSex <- data.frame(1:inputs$global_parameters$time_horizon,
             output_ex$n_GPvisits_by_ctime_sex/output_ex$n_alive_by_ctime_sex)

  names(GPSex) <- c("Year","Male","Female")

  message(paste(capture.output(GPSex), collapse = "\n"))

  GPSex.plot <- tidyr::gather(data=GPSex, key="Sex", value="Visits", c(Male,Female))

  GPSex.plot <- subset(GPSex.plot, Year!=1)

  GPSex.plotted <- ggplot2::ggplot(GPSex.plot, aes(x=Year, y=Visits, col=Sex)) +
                      geom_line() + geom_point() + expand_limits(y = 0) +
                      theme_bw() + ylab("Average GP visits/year") + xlab("Years")

  plot(GPSex.plotted)

  message("\n")

  message("Here is the Average number of GP visits by COPD severity:\n")

  GPCOPD <- data.frame(1:inputs$global_parameters$time_horizon,
                      output_ex$n_GPvisits_by_ctime_severity/output_ex$cumul_time_by_ctime_GOLD)

  names(GPCOPD) <- c("Year","NoCOPD","GOLD1","GOLD2","GOLD3","GOLD4")

  message(paste(capture.output(GPCOPD[-1,]), collapse = "\n"))


  GPCOPD.plot <- tidyr::gather(data=GPCOPD, key="COPD", value="Visits", c(NoCOPD:GOLD4))

  GPCOPD.plot <- subset(GPCOPD.plot, Year!=1)

  GPCOPD.plotted <- ggplot2::ggplot(GPCOPD.plot, aes(x=Year, y=Visits, col=COPD)) +
                        geom_line() + geom_point() + expand_limits(y = 0) +
                        theme_bw() + ylab("Average GP visits/year") + xlab("Years")

  plot(GPCOPD.plotted)

  message("\n")

  message("Here is the Average number of GP visits by COPD diagnosis status:\n")

  Diagnosed <- rowSums(output_ex$n_Diagnosed_by_ctime_sex)
  Undiagnosed <- rowSums(output_ex$cumul_time_by_ctime_GOLD[,2:5]) - Diagnosed
  data <- cbind(Undiagnosed, Diagnosed)

  GPDiag<- data.frame(Year=1:inputs$global_parameters$time_horizon,
                       output_ex$n_GPvisits_by_ctime_diagnosis/data)

  message(paste(capture.output(GPDiag[-1,]), collapse = "\n"))

  GPDiag.plot <- tidyr::gather(data=GPDiag, key="Diagnosis", value="Visits", c(Undiagnosed,Diagnosed))

  GPDiag.plot <- subset(GPDiag.plot, Year!=1)

  GPDiag.plotted <- ggplot2::ggplot(GPDiag.plot, aes(x=Year, y=Visits, col=Diagnosis)) +
                        geom_line() + geom_point() + expand_limits(y = 0) +
                        theme_bw() + ylab("Average GP visits/year") + xlab("Years")

  plot(GPDiag.plotted)

  message("\n")

  terminate_session()
}

#' Returns results of validation tests for Symptoms
#'
#' This function plots the prevalence of cough, dyspnea, phlegm and wheeze
#' over time and by GOLD stage.
#' @param n_sim number of agents
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results
#' @export
validate_symptoms <- function(n_sim = 1e+04, jurisdiction = "canada") {
  message("Let's take a look at symptoms\n")

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- n_sim
  settings$event_stack_size <- 0
  init_session(settings = settings)

  input <- model_input$values

  res <- run(input = input)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs <- get_inputs()
  output_ex <- get_output_ex()

  # COUGH
  message("\n")
  message("I'm going to plot the prevalence of each symptom over time and by GOLD stage\n")
  message("\n")
  message("Cough:\n")
  message("\n")

  cough <- data.frame(1:inputs$global_parameters$time_horizon,
                      output_ex$n_cough_by_ctime_severity/output_ex$n_COPD_by_ctime_severity)

  names(cough) <- c("Year","NoCOPD","GOLD1","GOLD2","GOLD3","GOLD4")

  message(paste(capture.output(cough), collapse = "\n"))

  # plot
  cough.plot <- tidyr::gather(data=cough, key="GOLD", value="Prevalence", NoCOPD:GOLD4)
  cough.plot$Symptom <- "cough"

  cough.plotted <- ggplot2::ggplot(cough.plot, aes(x=Year, y=Prevalence, col=GOLD)) +
                      geom_smooth(method=lm, formula = y~x, level=0) + geom_point() + expand_limits(y = 0) +
                      theme_bw() + ylab("Proportion with cough") + xlab("Model Year")

  #plot(cough.plotted)

  message("\n")

  # PHLEGM
  message("Phlegm:\n")
  message("\n")

  phlegm <- data.frame(1:inputs$global_parameters$time_horizon,
                      output_ex$n_phlegm_by_ctime_severity/output_ex$n_COPD_by_ctime_severity)

  names(phlegm) <- c("Year","NoCOPD","GOLD1","GOLD2","GOLD3","GOLD4")

  message(paste(capture.output(phlegm), collapse = "\n"))

  # plot
  phlegm.plot <- tidyr::gather(data=phlegm, key="GOLD", value="Prevalence", NoCOPD:GOLD4)
  phlegm.plot$Symptom <- "phlegm"

  phlegm.plotted <- ggplot2::ggplot(phlegm.plot, aes(x=Year, y=Prevalence, col=GOLD)) +
    geom_smooth(method=lm, formula = y~x, level=0) + geom_point() + expand_limits(y = 0) +
    theme_bw() + ylab("Proportion with phlegm") + xlab("Model Year")

  #plot(phlegm.plotted)

  message("\n")

  # WHEEZE
  message("Wheeze:\n")
  message("\n")

  wheeze <- data.frame(1:inputs$global_parameters$time_horizon,
                       output_ex$n_wheeze_by_ctime_severity/output_ex$n_COPD_by_ctime_severity)

  names(wheeze) <- c("Year","NoCOPD","GOLD1","GOLD2","GOLD3","GOLD4")

  message(paste(capture.output(wheeze), collapse = "\n"))

  # plot
  wheeze.plot <- tidyr::gather(data=wheeze, key="GOLD", value="Prevalence", NoCOPD:GOLD4)
  wheeze.plot$Symptom <- "wheeze"

  wheeze.plotted <- ggplot2::ggplot(wheeze.plot, aes(x=Year, y=Prevalence, col=GOLD)) +
    geom_smooth(method=lm, formula = y~x, level=0) + geom_point() + expand_limits(y = 0) +
    theme_bw() + ylab("Proportion with wheeze") + xlab("Model Year")

  #plot(wheeze.plotted)

  message("\n")

  # DYSPNEA
  message("Dyspnea:\n")
  message("\n")

  dyspnea <- data.frame(1:inputs$global_parameters$time_horizon,
                       output_ex$n_dyspnea_by_ctime_severity/output_ex$n_COPD_by_ctime_severity)

  names(dyspnea) <- c("Year","NoCOPD","GOLD1","GOLD2","GOLD3","GOLD4")

  message(paste(capture.output(dyspnea), collapse = "\n"))

  # plot
  dyspnea.plot <- tidyr::gather(data=dyspnea, key="GOLD", value="Prevalence", NoCOPD:GOLD4)
  dyspnea.plot$Symptom <- "dyspnea"

  dyspnea.plotted <- ggplot2::ggplot(dyspnea.plot, aes(x=Year, y=Prevalence, col=GOLD)) +
    geom_smooth(method=lm, formula = y~x, level=0) + geom_point() + expand_limits(y = 0) +
    theme_bw() + ylab("Proportion with dyspnea") + xlab("Model Year")

  #plot(dyspnea.plotted)

  message("\n")
  message("All symptoms plotted together:\n")

  all.plot <- rbind(cough.plot, phlegm.plot, wheeze.plot, dyspnea.plot)

  all.plotted <- ggplot2::ggplot(all.plot, aes(x=Year, y=Prevalence, col=GOLD)) +
    geom_smooth(method=lm, formula = y~x, level=0) + geom_point() + facet_wrap(~Symptom) +
    expand_limits(y = 0) +  theme_bw() + ylab("Proportion with symptom") + xlab("Model Year")

  plot(all.plotted)

  terminate_session()
}

#' Returns results of validation tests for Treatment
#'
#' This function runs validation tests to examine how treatment initiated at diagnosis
#' influences exacerbation rates in COPD patients. It compares exacerbation rates between
#' diagnosed and undiagnosed patients and assesses the impact of treatment.
#' @param n_sim number of agents
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results
#' @export
validate_treatment<- function(n_sim = 1e+04, jurisdiction = "canada") {
  message("Let's make sure that treatment (which is initiated at diagnosis) is affecting the exacerbation rate.\n")

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- n_sim
  settings$event_stack_size <- 0
  init_session(settings = settings)

  input <- model_input$values

  res <- run(input = input)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs <- get_inputs()
  output_ex <- get_output_ex()

  message("\n")
  message("Exacerbation rate for undiagnosed COPD patients.\n")
  message("\n")

  undiagnosed <- data.frame(cbind(1:inputs$global_parameters$time_horizon, output_ex$n_exac_by_ctime_severity_undiagnosed/
                                    (rowSums(output_ex$n_COPD_by_ctime_severity[,-1]) - rowSums(output_ex$n_Diagnosed_by_ctime_sex))))

  names(undiagnosed) <- c("Year","Mild","Moderate","Severe","VerySevere")
  message(paste(capture.output(undiagnosed), collapse = "\n"))
  undiagnosed$Diagnosis <- "undiagnosed"

  message("\n")
  message("Exacerbation rate for diagnosed COPD patients.\n")
  message("\n")

  diagnosed <- data.frame(cbind(1:inputs$global_parameters$time_horizon,
                                output_ex$n_exac_by_ctime_severity_diagnosed/rowSums(output_ex$n_Diagnosed_by_ctime_sex)))

  diagnosed[1,2:5] <- c(0,0,0,0)
  names(diagnosed) <- c("Year","Mild","Moderate","Severe","VerySevere")
  message(paste(capture.output(diagnosed), collapse = "\n"))
  diagnosed$Diagnosis <- "diagnosed"

  # plot
  exac.plot <- tidyr::gather(data=rbind(undiagnosed, diagnosed), key="Exacerbation", value="Rate", Mild:VerySevere)

  exac.plotted <- ggplot2::ggplot(exac.plot, aes(x=Year, y=Rate, fill=Diagnosis)) +
                      geom_bar(stat="identity", position="dodge") + facet_wrap(~Exacerbation, labeller=label_both) +
                      scale_y_continuous(expand = c(0, 0)) +
                      xlab("Model Year") + ylab("Annual rate of exacerbations") + theme_bw()

  plot(exac.plotted)

  message("\n")
  terminate_session()

  ###
  message("\n")
  message("Now, set the treatment effects to 0 and make sure the number of exacerbations increased among diagnosed patients.\n")
  message("\n")

  init_session(settings = settings)

  input_nt <- model_input$values

  input_nt$medication$medication_ln_hr_exac <- rep(0, length(inputs$medication$medication_ln_hr_exac))

  res <- run(input = input_nt)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs_nt <- get_inputs()
  output_ex_nt <- get_output_ex()

  exac.diff <- data.frame(cbind(1:inputs_nt$global_parameters$time_horizon,
                          output_ex_nt$n_exac_by_ctime_severity_diagnosed - output_ex$n_exac_by_ctime_severity_diagnosed))

  names(exac.diff) <- c("Year","Mild","Moderate","Severe","VerySevere")

  message("Without treatment, there was an average of:\n")
  message(mean(exac.diff$Mild),"more mild exacerbations,\n")
  message(mean(exac.diff$Moderate),"more moderate exacerbations,\n")
  message(mean(exac.diff$Severe),"more severe exacerbations, and\n")
  message(mean(exac.diff$VerySevere),"more very severe exacerbations per year.\n")

  ###
  message("\n")
  message("Now, set all COPD patients to diagnosed, then undiagnosed, and compare the exacerbation rates.\n")
  message("\n")

  init_session(settings = settings)

  input_nd <- model_input$values

  input_nd$diagnosis$logit_p_prevalent_diagnosis_by_sex <- cbind(male=c(intercept=-100, age=-0.0152, smoking=0.1068, fev1=-0.6146,
                                                                            cough=0.075, phlegm=0.283, wheeze=-0.0275, dyspnea=0.5414,
                                                                            case_detection=0),
                                                                     female=c(intercept=-100-0.1638, age=-0.0152, smoking=0.1068, fev1=-0.6146,
                                                                              cough=0.075, phlegm=0.283, wheeze=-0.0275, dyspnea=0.5414,
                                                                              case_detection=0))

  input_nd$diagnosis$p_hosp_diagnosis <- 0

  input_nd$diagnosis$logit_p_diagnosis_by_sex <- cbind(male=c(intercept=-100, age=-0.0324, smoking=0.3711, fev1=-0.8032,
                                                                  gpvisits=0.0087, cough=0.208, phlegm=0.4088, wheeze=0.0321, dyspnea=0.722,
                                                                  case_detection=0),
                                                           female=c(intercept=-100-0.4873, age=-0.0324, smoking=0.3711, fev1=-0.8032,
                                                                    gpvisits=0.0087, cough=0.208, phlegm=0.4088, wheeze=0.0321, dyspnea=0.722,
                                                                    case_detection=0))

  input_nd$diagnosis$logit_p_overdiagnosis_by_sex <- cbind(male=c(intercept=-100, age=0.0025, smoking=0.6911, gpvisits=0.0075,
                                                                      cough=0.7264, phlegm=0.7956, wheeze=0.66, dyspnea=0.8798,
                                                                      case_detection=0),
                                                               female=c(intercept=-100+0.2597, age=0.0025, smoking=0.6911, gpvisits=0.0075,
                                                                        cough=0.7264, phlegm=0.7956, wheeze=0.66, dyspnea=0.8798,
                                                                        case_detection=0))

  res <- run(input = input_nd)
  if (res < 0)
    stop("Execution stopped.\n")

  output_ex_nd <- get_output_ex()

  exac_rate_nodiag <- rowSums(output_ex_nd$n_exac_by_ctime_severity)/rowSums(output_ex_nd$n_COPD_by_ctime_sex)

  terminate_session()

  ###

  init_session(settings = settings)

  input_d <- model_input$values

  input_d$diagnosis$logit_p_prevalent_diagnosis_by_sex <- cbind(male=c(intercept=100, age=-0.0152, smoking=0.1068, fev1=-0.6146,
                                                                        cough=0.075, phlegm=0.283, wheeze=-0.0275, dyspnea=0.5414,
                                                                        case_detection=0),
                                                                 female=c(intercept=100-0.1638, age=-0.0152, smoking=0.1068, fev1=-0.6146,
                                                                          cough=0.075, phlegm=0.283, wheeze=-0.0275, dyspnea=0.5414,
                                                                          case_detection=0))

  input_d$diagnosis$p_hosp_diagnosis <- 1

  input_d$diagnosis$logit_p_diagnosis_by_sex <- cbind(male=c(intercept=100, age=-0.0324, smoking=0.3711, fev1=-0.8032,
                                                              gpvisits=0.0087, cough=0.208, phlegm=0.4088, wheeze=0.0321, dyspnea=0.722,
                                                              case_detection=0),
                                                       female=c(intercept=100-0.4873, age=-0.0324, smoking=0.3711, fev1=-0.8032,
                                                                gpvisits=0.0087, cough=0.208, phlegm=0.4088, wheeze=0.0321, dyspnea=0.722,
                                                                case_detection=0))


  res <- run(input = input_d)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs_d <- get_inputs()
  output_ex_d <- get_output_ex()

  exac_rate_diag <- rowSums(output_ex_d$n_exac_by_ctime_severity)/rowSums(output_ex_d$n_COPD_by_ctime_sex)

  ##
  message("Annual exacerbation rate (this is also plotted):\n")
  message("\n")

  trt_effect<- data.frame(Year=1:inputs_d$global_parameters$time_horizon,
                          Diagnosed = exac_rate_diag,
                          Undiagnosed = exac_rate_nodiag)

  trt_effect$Delta <- (trt_effect$Undiagnosed - trt_effect$Diagnosed)/trt_effect$Undiagnosed

  message(paste(capture.output(trt_effect), collapse = "\n"))

  message("\n")
  message("Treatment reduces the rate of exacerbations by a mean of:", mean(trt_effect$Delta),"\n")

  # plot
  trt.plot <- tidyr::gather(data=trt_effect, key="Diagnosis", value="Rate", Diagnosed:Undiagnosed)

  trt.plotted <- ggplot2::ggplot(trt.plot, aes(x=Year, y=Rate, col=Diagnosis)) +
                            geom_line() + geom_point() + expand_limits(y = 0) +
                            theme_bw() + ylab("Annual exacerbation rate") + xlab("Years")

  plot(trt.plotted)

  terminate_session()
}

#' Returns results of Case Detection strategies
#' @param n_sim number of agents
#' @param p_of_CD probability of recieving case detection given that an agent meets the selection criteria
#' @param min_age minimum age that can recieve case detection
#' @param min_pack_years minimum pack years that can recieve case detection
#' @param only_smokers set to 1 if only smokers should recieve case detection
#' @param CD_method Choose one case detection method: CDQ195", "CDQ165", "FlowMeter", "FlowMeter_CDQ"
#' @return results of case detection strategy compared to no case detection
#' @export
test_case_detection <- function(n_sim = 1e+04, p_of_CD=0.1, min_age=40, min_pack_years=0, only_smokers=0, CD_method="CDQ195") {
  message("Comparing a case detection strategy to no case detection.\n")
  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
#  settings$agent_stack_size <- 0
  settings$n_base_agents <- n_sim
  settings$event_stack_size <- 0
  init_session(settings = settings)

  input <- model_input$values

  input$diagnosis$p_case_detection <- p_of_CD
  input$diagnosis$min_cd_age <- min_age
  input$diagnosis$min_cd_pack_years <- min_pack_years
  input$diagnosis$min_cd_smokers <-only_smokers

  input$diagnosis$logit_p_prevalent_diagnosis_by_sex <- cbind(male=c(intercept=1.0543, age=-0.0152, smoking=0.1068, fev1=-0.6146,
                                                                     cough=0.075, phlegm=0.283, wheeze=-0.0275, dyspnea=0.5414,
                                                                     case_detection=input$diagnosis$case_detection_methods[1,CD_method]),
                                                              female=c(intercept=1.0543-0.1638, age=-0.0152, smoking=0.1068, fev1=-0.6146,
                                                                       cough=0.075, phlegm=0.283, wheeze=-0.0275, dyspnea=0.5414,
                                                                       case_detection=input$diagnosis$case_detection_methods[1,CD_method]))

  input$diagnosis$logit_p_diagnosis_by_sex <- cbind(male=c(intercept=-2, age=-0.0324, smoking=0.3711, fev1=-0.8032,
                                                           gpvisits=0.0087, cough=0.208, phlegm=0.4088, wheeze=0.0321, dyspnea=0.722,
                                                           case_detection=input$diagnosis$case_detection_methods[1,CD_method]),
                                                    female=c(intercept=-2-0.4873, age=-0.0324, smoking=0.3711, fev1=-0.8032,
                                                             gpvisits=0.0087, cough=0.208, phlegm=0.4088, wheeze=0.0321, dyspnea=0.722,
                                                             case_detection=input$diagnosis$case_detection_methods[1,CD_method]))

  input$diagnosis$logit_p_overdiagnosis_by_sex <- cbind(male=c(intercept=-5.2169, age=0.0025, smoking=0.6911, gpvisits=0.0075,
                                                               cough=0.7264, phlegm=0.7956, wheeze=0.66, dyspnea=0.8798,
                                                               case_detection=input$diagnosis$case_detection_methods[2,CD_method]),
                                                        female=c(intercept=-5.2169+0.2597, age=0.0025, smoking=0.6911, gpvisits=0.0075,
                                                                 cough=0.7264, phlegm=0.7956, wheeze=0.66, dyspnea=0.8798,
                                                                 case_detection=input$diagnosis$case_detection_methods[2,CD_method]))
  message("\n")
  message("Here are your inputs for the case detection strategy:\n")
  message("\n")
  message(paste(capture.output(input$diagnosis), collapse = "\n"))

  res <- run(input = input)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs <- get_inputs()
  output <- get_output()
  output_ex <- get_output_ex()

  # Exacerbations
  exac <- output$total_exac
  names(exac) <- c("Mild","Moderate","Severe","VerySevere")
    # rate
  total.gold <- colSums(output_ex$n_COPD_by_ctime_severity[,2:5])
  names(total.gold) <- c("GOLD1","GOLD2","GOLD3","GOLD4")

  exac.gs <- data.frame(output_ex$n_exac_by_gold_severity)
  colnames(exac.gs) <- c("Mild","Moderate","Severe","VerySevere")

  exac_rate <- rbind(GOLD1=exac.gs[1,]/total.gold[1],
                     GOLD2=exac.gs[2,]/total.gold[2],
                     GOLD3=exac.gs[3,]/total.gold[3],
                     GOLD4=exac.gs[4,]/total.gold[4])
  exac_rate$CD <- "Case detection"
  exac_rate$GOLD <- rownames(exac_rate)

  # GOLD
  gold <- data.frame(CD="Case detection",
                         Proportion=colMeans(output_ex$n_COPD_by_ctime_severity/rowSums(output_ex$n_alive_by_ctime_sex)))
  gold$GOLD <- c("NoCOPD","GOLD1","GOLD2","GOLD3","GOLD4")

  terminate_session()

  ## Rerunning with no case detection

  init_session(settings = settings)

  input_nocd <- model_input$values

  input_nocd$diagnosis$p_case_detection <- 0

  message("\n")
  message("Now setting the probability of case detection to", input_nocd$diagnosis$p_case_detection, "and re-running the model\n")
  message("\n")

  res <- run(input = input_nocd)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs_nocd <- get_inputs()
  output_nocd <- get_output()
  output_ex_nocd <- get_output_ex()

  # Exacerbations
  exac_nocd <- output_nocd$total_exac
  names(exac_nocd) <- c("Mild","Moderate","Severe","VerySevere")
    # rate
  total.gold_nocd <- colSums(output_ex_nocd$n_COPD_by_ctime_severity[,2:5])
  names(total.gold_nocd) <- c("GOLD1","GOLD2","GOLD3","GOLD4")

  exac.gs_nocd <- data.frame(output_ex_nocd$n_exac_by_gold_severity)
  colnames(exac.gs_nocd) <- c("Mild","Moderate","Severe","VerySevere")

  exac_rate_nocd <- rbind(GOLD1=exac.gs_nocd[1,]/total.gold_nocd[1],
                     GOLD2=exac.gs_nocd[2,]/total.gold_nocd[2],
                     GOLD3=exac.gs_nocd[3,]/total.gold_nocd[3],
                     GOLD4=exac.gs_nocd[4,]/total.gold_nocd[4])
  exac_rate_nocd$CD <- "No Case detection"
  exac_rate_nocd$GOLD <- rownames(exac_rate_nocd)

  # GOLD
  gold_nocd<- data.frame(CD="No case detection",
                         Proportion=colMeans(output_ex_nocd$n_COPD_by_ctime_severity/rowSums(output_ex_nocd$n_alive_by_ctime_sex)))
  gold_nocd$GOLD <- c("NoCOPD","GOLD1","GOLD2","GOLD3","GOLD4")

  ## Difference between CD and No CD
  # Exacerbations
  exac.diff <- data.frame(cbind(CD=exac, NOCD=exac_nocd))
  exac.diff$Delta <- exac.diff$CD - exac.diff$NOCD

  message("Here are total number of exacerbations by severity:\n")
  message("\n")
  message(paste(capture.output(exac.diff), collapse = "\n"))

  message("\n")
  message("The annual rate of exacerbations with case detection is:\n")
  message(paste(capture.output(exac_rate[,1:4]), collapse = "\n"))
  message("\n")
  message("The annual rate of exacerbations without case detection is:\n")
  message(paste(capture.output(exac_rate_nocd[,1:4]), collapse = "\n"))
  message("\n")
  message("This data is also plotted.\n")

  #plot
  exac.plot <- tidyr::gather(rbind(exac_rate, exac_rate_nocd), key="Exacerbation", value="Rate", Mild:VerySevere)

  exac.plotted <-ggplot2::ggplot(exac.plot, aes(x=Exacerbation, y=Rate, fill=CD)) +
                      geom_bar(stat="identity", position="dodge") + facet_wrap(~GOLD, scales="free_y") +
                      scale_y_continuous(expand = expand_scale(mult=c(0, 0.1))) +
                      xlab("Exacerbation") + ylab("Annual rate of exacerbations") + theme_bw()

  exac.plotted <- exac.plotted + theme(axis.text.x=element_text(angle=45, hjust=1)) +
                    theme(legend.title = element_blank())

  plot(exac.plotted)


  # GOLD
  # plot
  message("\n")
  message("The average proportion of agents in each gold stage is also plotted.\n")

  gold.plot <- rbind(gold, gold_nocd)

  gold.plot$GOLD <- factor(gold.plot$GOLD, levels=c("NoCOPD","GOLD1","GOLD2","GOLD3","GOLD4"))

  gold.plotted <- ggplot2::ggplot(gold.plot, aes(x=GOLD, y=Proportion, fill=CD)) +
                      geom_bar(stat="identity", position="dodge") +
                      scale_y_continuous(expand = c(0,0), limits=c(0,1)) +
                      xlab("GOLD stage") + ylab("Average proportion") + theme_bw()

  gold.plotted <- gold.plotted + theme(legend.title = element_blank())

  plot(gold.plotted)

  message("\n")

  terminate_session()
}


#' Returns results of validation tests for overdiagnosis
#'
#' This function returns the proportion of non-COPD subjects overdiagnosed
#' over model time.
#' @param n_sim number of agents
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results
#' @export
validate_overdiagnosis <- function(n_sim = 1e+04, jurisdiction = "canada") {
  message("Let's take a look at overdiagnosis\n")

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_none"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- n_sim
  settings$event_stack_size <- 0
  init_session(settings = settings)

  input <- model_input$values

  res <- run(input = input)
  if (res < 0)
    stop("Execution stopped.\n")

  inputs <- get_inputs()
  output_ex <- get_output_ex()

  message("Here are the proportion of non-COPD subjects overdiagnosed over model time: \n")

  overdiag <- data.frame(Year=1:inputs$global_parameters$time_horizon,
                     NonCOPD=output_ex$n_COPD_by_ctime_severity[,1],
                     Overdiagnosed=rowSums(output_ex$n_Overdiagnosed_by_ctime_sex))

  overdiag$Proportion <- overdiag$Overdiagnosed/overdiag$NonCOPD

  message(paste(capture.output(overdiag), collapse = "\n"))

  message("The average proportion overdiagnosed from year", round(length(overdiag$Proportion)/2,0), "to", length(overdiag$Proportion), "is",
      mean(overdiag$Proportion[(round(length(overdiag$Proportion)/2,0)):(length(overdiag$Proportion))]),"\n")

  overdiag.plot <- tidyr::gather(data=overdiag, key="Variable", value="Number", c(NonCOPD, Overdiagnosed))

  overdiag.plotted <- ggplot2::ggplot(overdiag.plot, aes(x=Year, y=Number, col=Variable)) +
    geom_line() + geom_point() + expand_limits(y = 0) +
    theme_bw() + ylab("Number of non-COPD subjects") + xlab("Years")

  plot(overdiag.plotted)

  message("\n")

  terminate_session()

}


#' Returns results of validation tests for medication module.
#'
#' This function returns plots showing medication usage over time
#' @param n_sim number of agents
#' @param jurisdiction character string specifying the jurisdiction ("canada" or "us"). Default is "canada"
#' @return validation test results for medication
#' @export

validate_medication <- function(n_sim = 5e+04, jurisdiction = "canada") {
  message("\n")
  message("Plotting medication usage over time:")
  message("\n")

  # Check jurisdiction
  if (tolower(jurisdiction) != "canada") {
    stop("Validation for jurisdiction '", jurisdiction, "' has not been implemented yet. Currently only 'canada' is supported.")
  }

  petoc()

  settings <- default_settings
  settings$record_mode <- record_mode["record_mode_event"]
  settings$agent_stack_size <- 0
  settings$n_base_agents <- n_sim
  settings$event_stack_size <- settings$n_base_agents * 1.7 * 30
  init_session(settings = settings)

  input <- model_input$values

res <- run(input = input)
if (res < 0)
  stop("Execution stopped.\n")

all_events <- as.data.frame(get_all_events_matrix())
all_annual_events <- all_events[all_events$event==1,] # only annual event

# Prop on each med class over time and by gold
all_annual_events$time <- floor(all_annual_events$local_time + all_annual_events$time_at_creation)

med.plot <- all_annual_events %>%
  group_by(time, gold) %>%
  count(medication_status) %>%
  mutate(prop=n/sum(n))

med.plot$gold <- as.character(med.plot$gold )

# overall among COPD patients
copd <- med.plot %>%
  filter(gold>0) %>%
  group_by(time, medication_status) %>%
  summarise(n=sum(n)) %>%
  mutate(prop=n/sum(n), gold="all copd") %>%
  select(time, gold, everything())

med.plot <- rbind(med.plot, copd)

med.plot$medication_status <- ifelse(med.plot$medication_status==0,"none",
                                     ifelse(med.plot$medication_status==1,"SABA",
                                            ifelse(med.plot$medication_status==4,"LAMA",
                                                   ifelse(med.plot$medication_status==6,"LAMA/LABA",
                                                          ifelse(med.plot$medication_status==14,"ICS/LAMA/LABA",9)))))

med.plotted <- ggplot2::ggplot(data=med.plot, aes(x=time, y=prop, col=medication_status)) +
  geom_line() + facet_wrap(~gold, labeller=label_both) +
  expand_limits(y = 0) + theme_bw() + ylab("Proportion per medication class") + xlab("Years") +
  theme(legend.title=element_blank())

plot(med.plotted)

terminate_session()
}
Any scripts or data that you put into this service are public.
epicR documentation built on March 8, 2026, 5:06 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
epicR
Evaluation Platform in Chronic Obstructive Pulmonary Disease

R/validation.R
In epicR: Evaluation Platform in Chronic Obstructive Pulmonary Disease

Try the epicR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

epicR Evaluation Platform in Chronic Obstructive Pulmonary Disease

R/validation.R In epicR: Evaluation Platform in Chronic Obstructive Pulmonary Disease

Try the epicR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

epicR
Evaluation Platform in Chronic Obstructive Pulmonary Disease

R/validation.R
In epicR: Evaluation Platform in Chronic Obstructive Pulmonary Disease