R/eq5d_devlin.R

Defines functions table_1_3_2 table_1_3_1 table_1_2_4 table_1_2_3 table_1_2_2 table_1_1_3 table_1_2_1 table_1_1_2 table_1_1_1

Documented in table_1_1_1 table_1_1_2 table_1_1_3 table_1_2_1 table_1_2_2 table_1_2_3 table_1_2_4 table_1_3_1 table_1_3_2

#' Table 1.1.1: Frequency of levels by dimensions, cross-sectional
#' 
#' @param df Data frame with the EQ-5D and follow-up columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @return Summary data frame.
#' @examples
#' table_1_1_1(
#'  df = example_data[example_data$time == "Pre-op",],
#'  names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'  eq5d_version = "3L"
#' )
#' @export

table_1_1_1<- function(df, 
                      names_eq5d = NULL,
                      eq5d_version = NULL) {
  do.call(.freqtab, list(
    df = df,
    names_eq5d = names_eq5d,
    eq5d_version = eq5d_version,
    add_summary_problems_change = FALSE
  ))
}


#' Table 1.1.2: Frequency of levels by dimensions, separated by category
#' 
#' @param df Data frame with the EQ-5D and follow-up columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_cat Character string for the category  column. If NULL, no grouping is used, and the table reports for the total population, i.e. equal to table 1.1.1.
#' @param levels_cat Character vector containing the order of the values in the category column, if the wish is to have these presented in a particular order. 
#' If NULL (default value), unless the variable is a factor, the levels will be ordered in the order of appearance in df.
#' @param eq5d_version Version of the EQ-5D instrument
#' @return Summary data frame.
#' @examples
#' table_1_1_2(
#'  df = example_data[example_data$time == "Pre-op",],
#'  names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'  name_cat = "procedure",
#'  levels_cat = c("Hip Replacement", "Knee Replacement"),
#'  eq5d_version = "3L"
#' )
#' @export
#' 
table_1_1_2<- function(df, 
                       names_eq5d = NULL,
                       name_cat = NULL,
                       levels_cat = NULL,
                       eq5d_version = NULL) {
  tmp <- .freqtab(df, names_eq5d, name_cat, levels_cat, eq5d_version)
  tmp[-(NROW(tmp)-(1:2)),]
}

#' Table 1.2.1: Frequency of levels by dimensions, by follow-up
#' 
#' @param df Data frame with the EQ-5D and follow-up columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column. If NULL, the function will check if there is a column named "follow-up" or "fu", in which case the first of those will be used.
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' If NULL (default value), the levels will be ordered in the order of appearance in df.
#' @param eq5d_version Version of the EQ-5D instrument
#' @return Summary data frame.
#' @examples
#' table_1_2_1(
#'   df = example_data,
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_fu = "time",
#'   levels_fu = c("Pre-op" , "Post-op"),
#'   eq5d_version = "3L"
#' )
#' @export

table_1_2_1<- function(df, 
                       names_eq5d = NULL,
                       name_fu = NULL,
                       levels_fu = NULL,
                       eq5d_version = NULL) {
  mc <- as.list(match.call()[-1])
  
  if(is.null(name_fu)){
    if("follow-up" %in% colnames(df)) mc$name_fu <- "follow_up"
    if("fu" %in% colnames(df)) mc$name_fu <- "fu"
  }
  do.call(.freqtab, mc)
}

#' Table 1.1.3: Prevalence of the 10 most frequently observed self-reported health states
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @param n Number of most frequently observed states to display (default 10)
#' @return Summary data frame
#' @export
#' @examples
#' table_1_1_3(
#'   df = example_data[example_data$time == "Pre-op",],
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   eq5d_version = "3L",
#'   n = 10
#' )

table_1_1_3 <- function(df, 
                      names_eq5d = NULL,
                      eq5d_version = NULL,
                      n = 10) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, 
                     eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- names_eq5d
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d, eq5d_version = eq5d_version, add_state = TRUE)
  df <- df[, "state", drop = FALSE]

  ### analysis ####

  # all states (non-NA only), aggregated and sorted
  df_cc <- df[!is.na(df$state), , drop = FALSE]
  states_agg <- aggregate(rep(1L, nrow(df_cc)), by = list(state = df_cc$state), FUN = sum)
  names(states_agg)[2] <- "n"
  states_agg <- states_agg[order(-states_agg$n), , drop = FALSE]
  states_agg$p <- states_agg$n / sum(states_agg$n)
  states_agg$cum_p <- cumsum(states_agg$p)
  rownames(states_agg) <- NULL

  # most frequent n non-NA states
  states_top <- states_agg[seq_len(min(n, nrow(states_agg))), , drop = FALSE]

  # worst state
  state_worst <- if (eq5d_version == "5L") "55555" else "33333"

  if (!state_worst %in% states_top[, 1]) {
    idx <- match(state_worst, states_agg$state)
    if (!is.na(idx)) {
      states_worst <- states_agg[idx, , drop = FALSE]
      states_worst$cum_p <- 1
    } else {
      states_worst <- states_agg[0, , drop = FALSE]
    }
    states_worst <- states_worst[c(1, 1), , drop = FALSE]
    states_worst[1, ] <- NA
    states_worst[1, 1] <- '...'
  } else {
    states_worst <- states_agg[0, , drop = FALSE]
  }

  # missing data
  n_miss <- sum(is.na(df$state))
  state_na <- data.frame(state = "Missing", n = n_miss,
                         p = n_miss / nrow(df), cum_p = NA_real_,
                         stringsAsFactors = FALSE)

  # combine and tidy up
  retval <- rbind(states_top, states_worst, state_na)
  names(retval) <- c("Health state", "Frequency", "Percentage", "Cumulative percentage")
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Table 1.2.2: Changes in health according to the PCHC (Paretian Classification of Health Change)
#' 
#' @param df Data frame with the EQ-5D, grouping, id and follow-up columns
#' @param name_id Character string for the patient id column
#' @param name_groupvar Character string for the grouping column
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' If NULL (default value), the levels will be ordered in the order of appearance in df.
#' @return Summary data frame
#' @export
#' @examples
#' table_1_2_2(
#'   df = example_data,
#'   name_id = "id",
#'   name_groupvar = "procedure",
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_fu = "time",
#'   levels_fu = c("Pre-op" , "Post-op")
#' )

table_1_2_2 <- function(df,
                      name_id,
                      name_groupvar,
                      names_eq5d = NULL,
                      name_fu = NULL, 
                      levels_fu = NULL) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df, 
                     names_eq5d = names_eq5d, 
                     name_fu = name_fu, levels_fu = levels_fu)
  names_eq5d <- temp$names_eq5d
  name_fu <- temp$name_fu
  levels_fu <- temp$levels_fu
  # check existence of columns 
  names_all <- c(name_groupvar, name_id, names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  names(df)[names(df) == name_id] <- "id"

  names(df)[names(df) == name_groupvar] <- "groupvar"
  df <- .prep_eq5d(df = df, names = names_eq5d)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)
  # sort by id - groupvar - time

  df <- df[order(df$id, df$groupvar, df$fu), , drop = FALSE]
  # check uniqueness of id-groupvar-fu combinations
  .check_uniqueness(df, group_by = c("id", "groupvar", "fu"))
  
  ### analysis ###
  
  # calculate change
  df <- .pchc(df = df, level_fu_1 = levels_fu[1])
  df <- df[!is.na(df$state), , drop = FALSE]
  df$state <- factor(df$state, levels = c("No change", "Improve", "Worsen", "Mixed change"))

  # summarise by groupvar, fu & state
  summary_dim <- aggregate(rep(1L, nrow(df)),
                           by = list(groupvar = df$groupvar, fu = df$fu, state = df$state),
                           FUN = sum)
  names(summary_dim)[names(summary_dim) == "x"] <- "n"
  key_gf <- paste(summary_dim$groupvar, summary_dim$fu, sep = "\001")
  summary_dim$p <- summary_dim$n / tapply(summary_dim$n, key_gf, sum)[key_gf]

  # summarise totals
  agg_tot <- aggregate(cbind(n = summary_dim$n, p = summary_dim$p),
                       by = list(groupvar = summary_dim$groupvar, fu = summary_dim$fu),
                       FUN = sum)
  agg_tot$state <- "Grand Total"
  summary_total <- agg_tot

  # combine & tidy up: pivot n/p to wide columns {groupvar}_{fu}_n / {groupvar}_{fu}_p
  combined <- rbind(summary_dim, summary_total)
  state_order <- c(levels(df$state), "Grand Total")
  all_states <- state_order[state_order %in% unique(as.character(combined$state))]
  retval <- data.frame(state = all_states, stringsAsFactors = FALSE)
  grp_vals <- unique(as.character(combined$groupvar))
  fu_vals  <- as.character(levels_fu[-1])
  for (gv in grp_vals) {
    for (f in fu_vals) {
      sub <- combined[combined$groupvar == gv & as.character(combined$fu) == f, , drop = FALSE]
      vals_n <- setNames(sub$n, as.character(sub$state))
      vals_p <- setNames(sub$p, as.character(sub$state))
      retval[[paste(gv, f, "n", sep = "_")]] <- ifelse(is.na(vals_n[retval$state]), 0, vals_n[retval$state])
      retval[[paste(gv, f, "p", sep = "_")]] <- ifelse(is.na(vals_p[retval$state]), 0, vals_p[retval$state])
    }
  }
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Table 1.2.3: Changes in health according to the PCHC, taking account of those with no problems
#' 
#' @param df Data frame with the EQ-5D, grouping, id and follow-up columns
#' @param name_id Character string for the patient id column
#' @param name_groupvar Character string for the grouping column
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' If NULL (default value), the levels will be ordered in the order of appearance in df.
#' @return Summary data frame
#' @export
#' @examples
#' table_1_2_3(
#'   df = example_data,
#'   name_id = "id",
#'   name_groupvar = "procedure",
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_fu = "time",
#'   levels_fu = c("Pre-op" , "Post-op")
#' )

table_1_2_3 <- function(df, 
                      name_id,
                      name_groupvar,
                      names_eq5d = NULL,
                      name_fu = NULL,
                      levels_fu = NULL) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df, 
                     names_eq5d = names_eq5d, 
                     name_fu = name_fu, levels_fu = levels_fu)
  names_eq5d <- temp$names_eq5d
  name_fu <- temp$name_fu
  levels_fu <- temp$levels_fu
  # check existence of columns 
  names_all <- c(name_groupvar, name_id, names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  names(df)[names(df) == name_id] <- "id"

  names(df)[names(df) == name_groupvar] <- "groupvar"
  df <- .prep_eq5d(df = df, names = names_eq5d)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)
  # sort by id - groupvar - time
  df <- df[order(df$id, df$groupvar, df$fu), , drop = FALSE]
  # check uniqueness of id-groupvar-fu combinations
  .check_uniqueness(df, group_by = c("id", "groupvar", "fu"))
  
  ### analysis ###
  
  # calculate change
  df <- .pchc(df = df, level_fu_1 = levels_fu[1], add_noprobs = TRUE)
  df <- df[!is.na(df$state), , drop = FALSE]

  # separate out those with problems & calculate percentages
  df_status <- df
  df_status$state_noprobs <- ifelse(df_status$state_noprobs == "No problems",
                                    "No problems", "Total with problems")
  summary_by_probs_status <- aggregate(rep(1L, nrow(df_status)),
    by = list(groupvar = df_status$groupvar, fu = df_status$fu,
              state_noprobs = df_status$state_noprobs), FUN = sum)
  names(summary_by_probs_status)[names(summary_by_probs_status) == "x"] <- "n"
  key_bp <- paste(summary_by_probs_status$groupvar, summary_by_probs_status$fu, sep = "\001")
  summary_by_probs_status$p <- summary_by_probs_status$n /
    tapply(summary_by_probs_status$n, key_bp, sum)[key_bp]

  # summarise classes within those with problems
  df_probs <- df[df$state_noprobs != "No problems", , drop = FALSE]
  summary_with_probs <- aggregate(rep(1L, nrow(df_probs)),
    by = list(groupvar = df_probs$groupvar, fu = df_probs$fu,
              state_noprobs = df_probs$state_noprobs), FUN = sum)
  names(summary_with_probs)[names(summary_with_probs) == "x"] <- "n"
  key_wp <- paste(summary_with_probs$groupvar, summary_with_probs$fu, sep = "\001")
  summary_with_probs$p <- summary_with_probs$n /
    tapply(summary_with_probs$n, key_wp, sum)[key_wp]

  # combine & tidy up
  combined <- rbind(summary_with_probs, summary_by_probs_status)
  state_order <- c("No change", "Improve", "Worsen", "Mixed change",
                   "Total with problems", "No problems")
  combined$state_noprobs <- factor(combined$state_noprobs, levels = state_order)
  combined <- combined[order(combined$state_noprobs), , drop = FALSE]
  all_states <- levels(combined$state_noprobs)[levels(combined$state_noprobs) %in%
                                                unique(as.character(combined$state_noprobs))]
  retval <- data.frame(state_noprobs = all_states, stringsAsFactors = FALSE)
  grp_vals <- unique(as.character(combined$groupvar))
  fu_vals  <- as.character(levels_fu[-1])
  for (gv in grp_vals) {
    for (f in fu_vals) {
      sub <- combined[combined$groupvar == gv & as.character(combined$fu) == f, , drop = FALSE]
      vals_n <- setNames(sub$n, as.character(sub$state_noprobs))
      vals_p <- setNames(sub$p, as.character(sub$state_noprobs))
      retval[[paste(gv, f, "n", sep = "_")]] <- ifelse(is.na(vals_n[retval$state_noprobs]), 0, vals_n[retval$state_noprobs])
      retval[[paste(gv, f, "p", sep = "_")]] <- ifelse(is.na(vals_p[retval$state_noprobs]), 0, vals_p[retval$state_noprobs])
    }
  }
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Table 1.2.4: Changes in levels in each dimension, percentages of total and of type of change
#' 
#' @param df Data frame with the EQ-5D, id and follow-up columns
#' @param name_id Character string for the patient id column
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' If NULL (default value), the levels will be ordered in the order of appearance in df.
#' @return Summary data frame
#' @export
#' @examples
#' table_1_2_4(
#'   df = example_data,
#'   name_id = "id",
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_fu = "time",
#'   levels_fu = c("Pre-op" , "Post-op")
#' )

table_1_2_4 <- function(df, 
                      name_id,
                      names_eq5d = NULL,
                      name_fu = NULL,
                      levels_fu = NULL) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df, 
                     names_eq5d = names_eq5d, 
                     name_fu = name_fu, levels_fu = levels_fu)
  names_eq5d <- temp$names_eq5d
  name_fu <- temp$name_fu
  levels_fu <- temp$levels_fu
  # check existence of columns 
  names_all <- c(name_id, names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  names(df)[names(df) == name_id] <- "id"
  df <- .prep_eq5d(df = df, names = names_eq5d)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)
  # sort by id - time

  df <- df[order(df$id, df$fu), , drop = FALSE]
  # check uniqueness of id-fu combinations
  .check_uniqueness(df, group_by = c("id", "fu"))
  
  ### analysis ###
  
  # calculate change
  levels_eq5d <- c("mo", "sc", "ua", "pd", "ad")
  level_fu_1 <- levels_fu[1]

  # reshape to long format
  df_long <- do.call(rbind, lapply(levels_eq5d, function(d) {
    data.frame(domain = d, id = df$id, fu = df$fu, value = df[[d]],
               stringsAsFactors = FALSE)
  }))
  # sort by domain, id, fu
  df_long <- df_long[order(df_long$domain, df_long$id, df_long$fu), , drop = FALSE]
  rownames(df_long) <- NULL

  # compute lagged difference (dplyr::lag equivalent)
  prev_val <- c(NA_real_, head(df_long$value, -1))
  df_long$diff <- prev_val - df_long$value
  df_long$level_change <- ifelse(is.na(prev_val), NA_character_,
                                 paste0(prev_val, "-", df_long$value))
  # set baseline to NA
  df_long$diff[as.character(df_long$fu) == as.character(level_fu_1)] <- NA_real_
  df_long$level_change[as.character(df_long$fu) == as.character(level_fu_1)] <- NA_character_

  # remove NAs (baseline rows) and classify difference
  df_diff <- df_long[!is.na(df_long$diff), , drop = FALSE]
  df_diff$diff <- ifelse(df_diff$diff > 0, "Better",
                   ifelse(df_diff$diff < 0, "Worse", "No change"))
  df_diff$domain <- factor(df_diff$domain, levels = levels_eq5d)
  df_diff <- df_diff[order(df_diff$domain), , drop = FALSE]

  # % total: n per (domain, diff, level_change), p = n / sum(n) within domain
  p_agg <- aggregate(rep(1L, nrow(df_diff)),
                     by = list(domain = df_diff$domain, diff = df_diff$diff,
                                level_change = df_diff$level_change), FUN = sum)
  names(p_agg)[names(p_agg) == "x"] <- "n"
  p_total <- p_agg
  p_total$p <- p_total$n / tapply(p_total$n, p_total$domain, sum)[p_total$domain]
  p_total$type <- "% Total"

  # % type: p = n / sum(n) within (domain, diff)
  p_type <- p_agg
  key_dt <- paste(p_type$domain, p_type$diff, sep = "\001")
  p_type$p <- p_type$n / tapply(p_type$n, key_dt, sum)[key_dt]
  p_type$type <- "% Type"

  # combine and build wide output
  combined_p <- rbind(p_total[, c("domain", "type", "diff", "level_change", "p")],
                      p_type[,  c("domain", "type", "diff", "level_change", "p")])

  # ordered (diff, level_change) row keys
  all_rows <- unique(combined_p[, c("diff", "level_change")])
  all_rows$diff <- factor(all_rows$diff, levels = c("No change", "Better", "Worse"))
  all_rows <- all_rows[order(all_rows$diff, all_rows$level_change), , drop = FALSE]
  rownames(all_rows) <- NULL
  retval <- all_rows

  # column order: {domain}_{type} as produced by expand_grid(levels_eq5d, c("% Total","% Type"))
  col_names <- as.vector(outer(levels_eq5d, c("% Total", "% Type"), paste, sep = "_"))
  for (cn in col_names) {
    sep_idx <- regexpr("_% ", cn)
    dom  <- substr(cn, 1, sep_idx - 1)
    type <- substr(cn, sep_idx + 1, nchar(cn))
    sub  <- combined_p[as.character(combined_p$domain) == dom & combined_p$type == type, , drop = FALSE]
    vals <- setNames(sub$p, paste(sub$diff, sub$level_change, sep = "\001"))
    row_key <- paste(as.character(retval$diff), retval$level_change, sep = "\001")
    retval[[cn]] <- ifelse(is.na(vals[row_key]), 0, vals[row_key])
  }
  retval <- retval[, c("diff", "level_change", col_names), drop = FALSE]
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Table 1.3.1: Summary statistics for the EQ-5D values by all the different LSSs (Level Sum Scores)
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary data frame
#' @export
#' @examples
#' df <- data.frame(make_all_EQ_states(version = "5L"))
#' table_1_3_1(
#'   df, 
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'   eq5d_version = "3L", 
#'   country = "US"
#' )

table_1_3_1 <- function(df, 
                      names_eq5d = NULL,
                      eq5d_version = NULL,
                      country){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_lss = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- df[, c("lss", "utility"), drop = FALSE]

  ### analysis ###

  # summarise non-NA values
  df_cc <- df[!is.na(df$lss), , drop = FALSE]
  grp_key <- df_cc$lss
  parts <- split(df_cc$utility, grp_key)
  summary_list <- lapply(sort(unique(grp_key)), function(k) {
    u <- parts[[as.character(k)]]
    data.frame(lss = k, Number = length(u), Mean = mean(u),
               `Standard Deviation` = sd(u), Median = median(u),
               Minimum = min(u), Maximum = max(u), Range = max(u) - min(u),
               check.names = FALSE, stringsAsFactors = FALSE)
  })
  summary <- do.call(rbind, summary_list)

  # missing data
  summary_na <- data.frame(lss = "Missing", Number = sum(is.na(df$lss)),
                            Mean = NA_real_, `Standard Deviation` = NA_real_,
                            Median = NA_real_, Minimum = NA_real_,
                            Maximum = NA_real_, Range = NA_real_,
                            check.names = FALSE, stringsAsFactors = FALSE)

  # combine and tidy up
  summary$lss <- as.character(summary$lss)
  retval <- rbind(summary, summary_na)
  names(retval)[names(retval) == "lss"] <- "LSS"
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Table 1.3.2: Distribution of the EQ-5D states by LFS (Level Frequency Score)
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @return Summary data frame
#' @export
#' @examples
#' table_1_3_2(
#'   example_data, 
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'   eq5d_version = "3L"
#' )

table_1_3_2 <- function(df, 
                      names_eq5d = NULL, 
                      eq5d_version = NULL){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_lfs = TRUE, eq5d_version = eq5d_version)
  df <- df[, "lfs", drop = FALSE]

  ### analysis ###

  # summarise non-NA values
  df_cc <- df[!is.na(df$lfs), , drop = FALSE]
  counts <- aggregate(rep(1L, nrow(df_cc)), by = list(lfs = df_cc$lfs), FUN = sum)
  names(counts)[2] <- "n"
  counts <- counts[order(-counts$n), , drop = FALSE]
  counts$p <- counts$n / sum(counts$n)
  counts$cum_p <- cumsum(counts$p)
  names(counts) <- c("LFS", "Frequency", "%", "Cum (%)")

  # missing data
  summary_na <- data.frame(LFS = "Missing", Frequency = sum(is.na(df$lfs)),
                            `%` = NA_real_, `Cum (%)` = NA_real_,
                            check.names = FALSE, stringsAsFactors = FALSE)

  # combine
  retval <- rbind(counts, summary_na)
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Table 1.3.3: Number of observations in the LFS (Level Frequency Score) according to the EQ-5D values
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary data frame
#' @export
#' @examples
#' table_1_3_3(
#'   example_data, 
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'   eq5d_version = "3L",
#'   country = "UK"
#' )

table_1_3_3 <- function(df, 
                      names_eq5d = NULL,
                      eq5d_version = NULL,
                      country){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_lfs = TRUE, eq5d_version = eq5d_version,
                   add_utility = TRUE, country = country)
  df <- df[, c("lfs", "utility"), drop = FALSE]

  ### analysis ###

  # exclude NAs and count per (utility, lfs) combination
  df_cc <- df[!is.na(df$lfs), , drop = FALSE]
  counts <- aggregate(rep(1L, nrow(df_cc)),
                      by = list(utility = df_cc$utility, lfs = df_cc$lfs), FUN = sum)
  names(counts)[3] <- "n"

  # all lfs levels (sorted)
  all_lfs <- sort(unique(as.character(df_cc$lfs)))
  all_utils <- sort(unique(df_cc$utility))

  # build wide matrix: rows = utility, cols = lfs
  retval <- data.frame(utility = all_utils, stringsAsFactors = FALSE)
  for (lf in all_lfs) {
    sub <- counts[as.character(counts$lfs) == lf, , drop = FALSE]
    vals <- setNames(sub$n, sub$utility)
    retval[[lf]] <- ifelse(is.na(vals[as.character(retval$utility)]), 0L,
                           vals[as.character(retval$utility)])
  }
  # add row totals
  retval$Total <- rowSums(retval[, all_lfs, drop = FALSE])
  # tidy up
  names(retval)[names(retval) == "utility"] <- "EQ-5D Value"
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Table 1.3.4: Summary statistics of EQ-5D values by LFS (Level Frequency Score)
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary data frame
#' @export
#' @examples
#' table_1_3_4(
#'   example_data, 
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'   eq5d_version = "3L",
#'   country = "UK"
#' )

table_1_3_4 <- function(df, 
                       names_eq5d = NULL,
                       eq5d_version = NULL,
                       country){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d, 
                   add_state = TRUE, 
                   add_lfs = TRUE, eq5d_version = eq5d_version,
                   add_utility = TRUE, country = country) 
  
  ### analysis ###
  
  # summarise non-NA values
  df_cc <- df[!is.na(df$utility), , drop = FALSE]
  parts <- split(df_cc$utility, df_cc$lfs)
  result_list <- lapply(sort(names(parts)), function(k) {
    u <- parts[[k]]
    data.frame(LFS = k, Frequency = length(u), Mean = mean(u), SD = sd(u),
               Median = median(u), Minimum = min(u), Maximum = max(u),
               Range = max(u) - min(u), stringsAsFactors = FALSE)
  })
  retval <- do.call(rbind, result_list)
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Figure 1.4.1: Generate a Health State Density Curve (HSDC) for EQ-5D Data
#'
#' This function calculates and plots the Health State Density Curve (HSDC) for a given
#' EQ-5D dataset. It concatenates dimension values to form health state profiles, filters
#' out invalid states based on the specified EQ-5D version, then computes the cumulative
#' distribution of profiles (profiles vs. observations). A diagonal reference line
#' indicates a perfectly even distribution. The function also calculates the Health State
#' Density Index (HSDI), representing how sharply the observed distribution deviates from
#' the diagonal.
#'
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @return A list containing:
#'   \item{plot_data}{A data frame with the cumulative distribution of profiles}
#'   \item{p}{A ggplot2 object showing the Health State Density Index}
#' @export
#' @examples
#' figure <- figure_1_4_1(
#'             df = example_data, 
#'             names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'             eq5d_version = "3L"
#'           )
#' figure$plot_data
#' figure$p
#' @importFrom utils head

figure_1_4_1 <- function(df, names_eq5d, eq5d_version) {
  
  # Retrieve validated names & version from helper function
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d   <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  
  # Check existence of columns
  names_all <- c(names_eq5d)
  if (!all(names_all %in% colnames(df))) {
    stop("Provided column names not in the data frame. Stopping.")
  }
  
  # Subset to relevant columns
  df <- df[, names_all, drop = FALSE]

  # Prepare data (adds 'state' column via .prep_eq5d)
  df <- .prep_eq5d(
    df           = df,
    names        = names_eq5d,
    add_state    = TRUE,
    add_lfs      = FALSE,
    eq5d_version = eq5d_version
  )

  # Check for NA in 'state', show warning and filter
  n_na <- sum(is.na(df$state))
  if (n_na > 0) {
    warning(n_na, " excluded observations with invalid EQ-5D profiles.")
    df <- df[!is.na(df$state), , drop = FALSE]
  }

  # Calculate frequencies & cumulative proportions
  freq_agg <- aggregate(rep(1L, nrow(df)), by = list(state = df$state), FUN = sum)
  names(freq_agg)[2] <- "Frequency"
  freq_agg <- freq_agg[order(-freq_agg$Frequency), , drop = FALSE]
  rownames(freq_agg) <- NULL
  freq_agg$CumPropObservations <- cumsum(freq_agg$Frequency) / sum(freq_agg$Frequency)
  freq_agg$CumPropStates       <- seq_len(nrow(freq_agg)) / nrow(freq_agg)
  profile_freq <- freq_agg
  
  # Compute the Health State Density Index (HSDI)
  hsdi_area <- sum(
    diff(c(0, profile_freq$CumPropObservations)) * 
      (head(c(0, profile_freq$CumPropStates), -1) + profile_freq$CumPropStates) / 2
  )
  hsdi <- round(2 * hsdi_area, 3)
  
  # Plot Health State Density Curve
  p <- ggplot2::ggplot(profile_freq, ggplot2::aes(
    x = .data$CumPropObservations, 
    y = .data$CumPropStates
  )) +
    ggplot2::geom_line(color = "blue", linewidth = 1.2) +
    ggplot2::geom_abline(
      intercept = 0, 
      slope = 1, 
      linetype = "solid", 
      linewidth = 1.2, 
      color = "orange"
    ) +
    ggplot2::labs(
      title    = paste0("Health State Density Curve (HSDC)"),
      subtitle = paste0("HSDI = ", hsdi),
      x        = "Cumulative proportion of observations",
      y        = "Cumulative proportion of profiles"
    ) +
    # Force the aspect ratio to 1:1
    ggplot2::coord_fixed(ratio = 1) +
    # Limit x,y range from 0 to 1
    ggplot2::scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
    ggplot2::scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
    ggplot2::theme_minimal()
  
  return(list(plot_data = profile_freq, p = p))
}

#' Table 2.1: EQ VAS Score by timepoints
#' 
#' @param df Data frame with the VAS and the follow-up columns
#' @param name_vas Character string for the VAS column
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' @return Summary data frame
#' @export
#' @examples
#' table_2_1(
#'   example_data,
#'   name_vas = 'vas', 
#'   name_fu = 'time', 
#'   levels_fu = c('Pre-op', 'Post-op')
#' )

table_2_1 <- function(df, 
                      name_vas = NULL,
                      name_fu = NULL,
                      levels_fu = NULL){
  
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df, name_fu = name_fu, levels_fu = levels_fu, name_vas = name_vas)
  name_fu <- temp$name_fu
  levels_fu <- temp$levels_fu
  name_vas <- temp$name_vas
  # check existence of columns 
  names_all <- c(name_vas, name_fu)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_vas(df = df, name = name_vas)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)
  
  ### analysis ###
  
  # values calculated in the .summary_cts_by_fu wrapper
  retval <- .summary_cts_by_fu(df = df, name_v = "vas")
  
  # return value
  return(retval)
}

#' Table 2.2: EQ VAS Scores frequency of mid-points
#' 
#' @param df Data frame with the VAS column
#' @param name_vas Character string for the VAS column
#' @param add_na_total Logical, whether to add summary of the missing, and across the Total, data
#' @return Summary data frame
#' @export
#' @examples
#' table_2_2(
#'   example_data,
#'   name_vas = 'vas', 
#'   add_na_total =  TRUE
#'   )

table_2_2 <- function(df, 
                      name_vas = NULL, 
                      add_na_total = TRUE){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(name_vas = name_vas)
  name_vas <- temp$name_vas
  # check existence of columns 
  names_all <- c(name_vas)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_vas(df = df, name = name_vas)
  
  ### analysis ###
  
  # define ranges and midpoints
  l_vals <- c(0:2, seq(from = 3, to = 93, by = 5), 98:100)
  u_vals <- c(0:2, seq(from = 7, to = 97, by = 5), 98:100)
  mid_vals <- c(0:2, seq(from = 5, to = 95, by = 5), 98:100)
  retval <- data.frame(l = l_vals, u = u_vals, Midpoint = mid_vals, Frequency = NA_integer_)

  # populate the table
  for (i in seq_len(nrow(retval))) {
    retval[i, "Frequency"] <- sum(!is.na(df$vas) & df$vas >= retval$l[i] & df$vas <= retval$u[i])
  }

  # add range column and tidy up
  retval$Range <- ifelse(retval$l == retval$u, as.character(retval$l),
                         paste(retval$l, retval$u, sep = "-"))
  retval <- retval[, c("Range", "Midpoint", "Frequency")]

  # add totals & missing data if needed
  if (add_na_total) {
    n_total <- nrow(df)
    n_na    <- sum(is.na(df$vas))
    retval  <- rbind(retval,
                     data.frame(Range = c("Total observed", "Missing", "Total sample"),
                                Midpoint = rep(NA_real_, 3),
                                Frequency = c(n_total - n_na, n_na, n_total)))
  }
  
  # return value
  return(retval)
}

#' Table 3.1: EQ-5D values: by timepoints
#' 
#' @param df Data frame with the EQ-5D and follow-up columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' If NULL (default value), the levels will be ordered in the order of appearance in df.
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary data frame
#' @export
#' @examples
#' table_3_1(
#'   example_data,
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'   name_fu = "time",
#'   levels_fu = c('Pre-op', 'Post-op'),
#'   eq5d_version = "3L",
#'   country = "UK"
#' )

table_3_1 <- function(df, 
                      names_eq5d = NULL,
                      name_fu = NULL,
                      levels_fu = NULL,
                      eq5d_version = NULL,
                      country){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df, 
                     names_eq5d = names_eq5d, 
                     name_fu = name_fu, levels_fu = levels_fu,
                     eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  name_fu <- temp$name_fu
  levels_fu <- temp$levels_fu
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)
  df <- df[, c("fu", "utility"), drop = FALSE]

  ### analysis ###

  # summarise
  retval <- .summary_cts_by_fu(df = df, name_v = "utility")
  
  # return value
  return(retval)
}

#' Table 3.2 EQ-5D values: by groupvar
#' 
#' @param df Data frame with the EQ-5D, follow-up and grouping columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_groupvar Character string for the grouping column
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' @return Summary data frame
#' @export
#' @examples
#' table_3_2(
#'   example_data,
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_groupvar = "procedure",
#'   eq5d_version = "3L",
#'   country = "UK"
#' )

table_3_2 <- function(df,
                      names_eq5d = NULL,
                      name_groupvar,
                      eq5d_version = NULL,
                      country){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df, 
                     names_eq5d = names_eq5d, 
                     eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d, name_groupvar)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  names(df)[names(df) == name_groupvar] <- "groupvar"
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- df[, c("groupvar", "utility"), drop = FALSE]

  ### analysis ###

  # summary by category
  summary_by <- .summary_table_4_3(df = df, group_by = "groupvar")
  # summary_total (all groups combined)
  u_all <- df$utility
  summary_total <- data.frame(
    groupvar = "All groups",
    Mean = mean(u_all, na.rm = TRUE),
    `Standard error` = sd(u_all, na.rm = TRUE) / sqrt(sum(!is.na(u_all))),
    Median = median(u_all, na.rm = TRUE),
    `25th` = quantile(u_all, probs = 0.25, na.rm = TRUE, names = FALSE),
    `75th` = quantile(u_all, probs = 0.75, na.rm = TRUE, names = FALSE),
    N = sum(!is.na(u_all)),
    Missing = sum(is.na(u_all)),
    check.names = FALSE, stringsAsFactors = FALSE
  )

  # combine: pivot longer then wider (rows = stat names, cols = groups)
  combined <- rbind(summary_by, summary_total)
  stat_cols <- setdiff(names(combined), "groupvar")
  grp_vals  <- unique(combined$groupvar)
  retval <- data.frame(name = stat_cols, stringsAsFactors = FALSE)
  for (gv in grp_vals) {
    sub <- combined[combined$groupvar == gv, stat_cols, drop = FALSE]
    retval[[gv]] <- as.numeric(sub[1, ])
  }
  rownames(retval) <- NULL

  # return value
  return(retval)
}

#' Table 3.3 EQ-5D values: by age and groupvar
#' 
#' @param df Data frame with the EQ-5D, age, follow-up and grouping columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' If NULL (default value), the levels will be ordered in the order of appearance in df.
#' @param name_groupvar Character string for the grouping column
#' @param name_age Character string for the age column
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary data frame
#' @export
#' @examples
#' example_data$ageband <- factor(
#'   example_data$ageband,
#'   levels = c("20 to 29", "30 to 39", "40 to 49", "50 to 59", "60 to 69", "70 to 79", "80 to 89")
#' )
#' example_data <- example_data[example_data$gender %in% c("Male", "Female"),]
#' table_3_3(
#'   example_data,
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_fu = "time",
#'   levels_fu = c('Pre-op', 'Post-op'),
#'   name_groupvar = "gender",
#'   name_age = "ageband",
#'   eq5d_version = "3L",
#'   country = "UK"
#' )

table_3_3 <- function(df,
                      names_eq5d = NULL,
                      name_fu = NULL,
                      levels_fu = NULL,
                      name_groupvar,
                      name_age,
                      eq5d_version = NULL,
                      country){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df,
                     names_eq5d = names_eq5d, 
                     name_fu = name_fu, levels_fu = levels_fu,
                     eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  name_fu <- temp$name_fu
  levels_fu <- temp$levels_fu
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d, name_fu, name_groupvar, name_age)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  names(df)[names(df) == name_groupvar] <- "groupvar"
  names(df)[names(df) == name_age]      <- "age"
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)
  df <- df[, c("groupvar", "age", "fu", "utility"), drop = FALSE]

  # split age into categories
  if (is.numeric(df$age)) {
    age_breaks <- c(0, 18, seq(from = 25, to = 75, by = 10), Inf)
    df$age_cat <- cut(df$age, breaks = age_breaks, right = FALSE)
    df$age_cat <- factor(df$age_cat,
      levels = c("[0,18)", "[18,25)", "[25,35)", "[35,45)", "[45,55)", "[55,65)", "[65,75)", "[75,Inf)"),
      labels = c("0-17", "18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75+"))
    df$age <- NULL
  } else if (is.factor(df$age)) {
    names(df)[names(df) == "age"] <- "age_cat"
  } else {
    stop("The 'age' column must be either numeric or factor. Stopping.")
  }

  ### analysis ###

  # helper for overall summary without grouping
  .total_summary_4_4 <- function(u) {
    data.frame(
      Mean = mean(u, na.rm = TRUE),
      `Standard error` = sd(u, na.rm = TRUE) / sqrt(sum(!is.na(u))),
      `25th Percentile` = quantile(u, probs = 0.25, na.rm = TRUE, names = FALSE),
      `50th Percentile (median)` = median(u, na.rm = TRUE),
      `75th Percentile` = quantile(u, probs = 0.75, na.rm = TRUE, names = FALSE),
      n = sum(!is.na(u)),
      Missing = sum(is.na(u)),
      check.names = FALSE, stringsAsFactors = FALSE
    )
  }

  # summary by category
  summary_by <- .summary_table_4_4(df = df, group_by = c("groupvar", "age_cat"))
  # summary of totals by groupvar (age_cat = "Total")
  summary_total_age <- .summary_table_4_4(df = df, group_by = "groupvar")
  summary_total_age$age_cat <- "Total"
  # summary of totals by age_cat (groupvar = "Total")
  summary_total_by <- .summary_table_4_4(df = df, group_by = "age_cat")
  summary_total_by$groupvar <- "Total"
  # overall total
  st <- .total_summary_4_4(df$utility)
  st$groupvar <- "Total"
  st$age_cat  <- "Total"
  summary_total_by_age <- st

  # combine all
  combined <- rbind(summary_by, summary_total_age, summary_total_by, summary_total_by_age)

  # pivot: rows = (groupvar, stat_name), cols = age_cat values
  stat_cols <- setdiff(names(combined), c("groupvar", "age_cat"))
  all_gc <- unique(combined[, c("groupvar", "age_cat"), drop = FALSE])
  grp_vals <- unique(combined$groupvar)
  age_vals <- unique(combined$age_cat)

  # build result: for each (groupvar, stat), create row; cols = age_cat
  result_list <- lapply(grp_vals, function(gv) {
    sub <- combined[combined$groupvar == gv, , drop = FALSE]
    do.call(rbind, lapply(stat_cols, function(sc) {
      row <- data.frame(groupvar = gv, name = sc, stringsAsFactors = FALSE)
      for (ac in age_vals) {
        s2 <- sub[sub$age_cat == ac, sc, drop = TRUE]
        row[[ac]] <- if (length(s2) == 1) s2 else NA_real_
      }
      row
    }))
  })
  retval <- do.call(rbind, result_list)
  rownames(retval) <- NULL

  # return value
  return(retval)
}


#' Figure 1.2.1: Paretian Classification of Health Change (PCHC) by Group
#' This function computes PCHC categories between two time points for each subject, 
#' stratifies them by a grouping variable, and produces
#' a single bar chart with side-by-side bars showing the distribution of PCHC categories.
#' @param df Data frame containing EQ-5D dimensions, a grouping variable, patient ID, and follow-up columns
#' @param name_id Character string for the patient ID column
#' @param name_groupvar Character string for the grouping column (e.g., procedure)
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector of length 2 indicating the order of follow-up time points (e.g., c("Pre-op", "Post-op"))
#' @return A list with two elements:
#'   \item{plot_data}{A tibble of PCHC percentages by group}
#'   \item{p}{A ggplot2 object showing a bar chart with side-by-side bars for each PCHC category}
#' @export
#' @examples
#' result <- figure_1_2_1(
#'   df = example_data,
#'   name_id = "id",
#'   name_groupvar = "procedure",
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_fu = "time",
#'   levels_fu = c("Pre-op", "Post-op")
#' )
#' result$p        # shows the plot
#' result$plot_data  # shows the summary table

figure_1_2_1 <- function(df,
                         name_id,
                         name_groupvar,
                         names_eq5d = NULL,
                         name_fu = NULL,
                         levels_fu = NULL) {
  ### 1) Data Preparation ###

  # Replace NULL names with defaults (helper function that sets names_eq5d, name_fu, etc.)
  temp <- .get_names(df = df,
                     names_eq5d = names_eq5d,
                     name_fu = name_fu,
                     levels_fu = levels_fu)
  names_eq5d <- temp$names_eq5d
  name_fu    <- temp$name_fu
  levels_fu  <- temp$levels_fu

  # Check columns exist
  names_all <- c(name_id, name_groupvar, names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df))) {
    stop("Provided column names not in dataframe. Stopping.")
  }

  # Keep only relevant columns
  df <- df[, names_all, drop = FALSE]

  # Rename for internal use
  names(df)[names(df) == name_id]       <- "id"
  names(df)[names(df) == name_groupvar] <- "groupvar"

  # Prepare EQ-5D & Follow-up columns
  df <- .prep_eq5d(df = df, names = names_eq5d)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)

  # Sort by (id, groupvar, fu)
  df <- df[order(df$id, df$groupvar, df$fu), , drop = FALSE]

  # Check uniqueness: each (id, groupvar, fu) should appear at most once
  .check_uniqueness(df, group_by = c("id", "groupvar", "fu"))

  ### 2) Calculate PCHC (Paretian Classification) ###
  # .pchc() compares the first level in levels_fu (e.g. "Pre-op") vs. the second (e.g. "Post-op")
  df <- .pchc(df = df, level_fu_1 = levels_fu[1], add_noprobs = TRUE)
  df <- df[!is.na(df$state), , drop = FALSE]
  # The factor levels for PCHC categories can be: "No problems", "No change", "Improve", "Worsen", "Mixed change"
  df$state_noprobs <- factor(
    df$state_noprobs,
    levels = c("No problems", "No change", "Improve", "Worsen", "Mixed change")
  )

  ### 3) Summarize for Single Bar Chart ###
  # We want one bar per groupvar for each state_noprobs
  plot_data <- aggregate(rep(1L, nrow(df)),
                         by = list(groupvar = df$groupvar, state_noprobs = df$state_noprobs),
                         FUN = sum)
  names(plot_data)[names(plot_data) == "x"] <- "n"
  key_gv <- plot_data$groupvar
  plot_data$p <- plot_data$n / tapply(plot_data$n, key_gv, sum)[key_gv]

  ### 4) Build the Plot ###
  p <- ggplot2::ggplot(plot_data,
                       ggplot2::aes(x = .data$state_noprobs,
                                    y = .data$p,
                                    fill = .data$groupvar)) +
    ggplot2::geom_bar(stat = "identity", position = ggplot2::position_dodge(width = 0.9)) +
    ggplot2::geom_text(
      ggplot2::aes(label = scales::percent(.data$p, accuracy = 0.1)),
      position = ggplot2::position_dodge(width = 0.9),
      vjust = -0.5
    ) +
    ggplot2::scale_x_discrete(name = "Pareto classification") +
    ggplot2::scale_y_continuous(
      name    = "Percentage of respondents",
      labels  = scales::percent_format(accuracy = 1),
      expand  = ggplot2::expansion(mult = c(0, 0.1))
    ) +
    ggplot2::labs(fill = "Group") +
    ggplot2::ggtitle("Paretian Classification of Health Change (PCHC) by group over time") +
    ggplot2::scale_fill_brewer(palette = "Blues") +
    ggplot2::theme_minimal(base_size = 14)

  ### 5) Reshape Plot Data for Return ###
  # Convert from long to wide so each group appears as a column
  plot_data_wide <- stats::reshape(
    plot_data[, c("state_noprobs", "groupvar", "p")],
    idvar     = "state_noprobs",
    timevar   = "groupvar",
    direction = "wide"
  )
  names(plot_data_wide) <- sub("^p\\.", "", names(plot_data_wide))
  plot_data_wide[is.na(plot_data_wide)] <- 0
  names(plot_data_wide)[names(plot_data_wide) == "state_noprobs"] <- "Change category"

  ### 6) Return ###
  # Return list with summarized data & the ggplot object (with optional theme modifications)
  return(list(
    plot_data = plot_data_wide,
    p = .modify_ggplot_theme(p = p)  # if .modify_ggplot_theme is available
  ))
}

#' Figure 1.2.2: Percentage of Respondents Who Improved in Each EQ-5D Dimension, by Group
#' This function calculates how many respondents improved in each dimension between
#' two time points and summarizes the results for each group. The, it prodcuces 
#' a dimension-focused chart illustrating improvement percentages by dimension.
#' @param df Data frame containing EQ-5D columns, a grouping variable, an ID column, and a follow-up column
#' @param name_id Character string for the patient ID column
#' @param name_groupvar Character string for the grouping column (e.g. procedure)
#' @param names_eq5d Character vector of EQ-5D dimension names
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector of length 2, specifying the order of the follow-up levels (e.g. c("Pre-op","Post-op"))
#' @return A list containing:
#'   \item{plot_data}{A data frame of improvements by group and dimension}
#'   \item{p}{A ggplot2 object produced by `.pchc_plot_by_dim()`}
#' @export
#'
#' @examples
#' result <- figure_1_2_2(
#'   df = example_data,
#'   name_id = "id",
#'   name_groupvar = "procedure",
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_fu = "time",
#'   levels_fu = c("Pre-op","Post-op")
#' )
#' result$p
#' result$plot_data
#'
figure_1_2_2 <- function(df,
                         name_id,
                         name_groupvar,
                         names_eq5d = NULL,
                         name_fu = NULL,
                         levels_fu = NULL) {
  ### 1) Data Preparation ###
  
  # Replace NULL names with defaults
  temp <- .get_names(df = df, 
                     names_eq5d = names_eq5d, 
                     name_fu = name_fu, 
                     levels_fu = levels_fu)
  names_eq5d <- temp$names_eq5d
  name_fu    <- temp$name_fu
  levels_fu  <- temp$levels_fu
  
  # Check columns exist
  names_all <- c(name_id, name_groupvar, names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df))) {
    stop("Provided column names not in dataframe. Stopping.")
  }
  
  # Keep only relevant columns
  df <- df[, names_all, drop = FALSE]

  # Rename internal columns
  names(df)[names(df) == name_id]       <- "id"
  names(df)[names(df) == name_groupvar] <- "groupvar"

  # Prepare EQ-5D & Follow-up columns
  df <- .prep_eq5d(df = df, names = names_eq5d)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)

  # Sort by (id, groupvar, fu)
  df <- df[order(df$id, df$groupvar, df$fu), , drop = FALSE]

  # Ensure uniqueness of (id, groupvar, fu)
  .check_uniqueness(df, group_by = c("id", "groupvar", "fu"))

  ### 2) Analysis ###
  # .pchc() calculates difference columns like mo_diff, sc_diff, etc.
  df <- .pchc(df = df, level_fu_1 = levels_fu[1])
  df <- df[!is.na(df$state), , drop = FALSE]

  # Focus on those whose overall 'state' == "Improve"
  # then check dimension-specific improvements (e.g. mo_diff > 0)
  dimension_names <- c("mo_diff", "sc_diff", "ua_diff", "pd_diff", "ad_diff")

  df_imp <- df[df$state == "Improve", c("groupvar", "fu", dimension_names), drop = FALSE]
  # pivot longer manually
  df_long <- do.call(rbind, lapply(dimension_names, function(dn) {
    data.frame(groupvar = df_imp$groupvar,
               fu       = df_imp$fu,
               name     = dn,
               value    = df_imp[[dn]],
               stringsAsFactors = FALSE)
  }))
  # group_by(groupvar, fu, name) summarise
  plot_data <- do.call(rbind, lapply(
    split(df_long, list(df_long$groupvar, df_long$fu, df_long$name), drop = TRUE),
    function(sub) {
      data.frame(groupvar = sub$groupvar[1],
                 fu       = sub$fu[1],
                 name     = sub$name[1],
                 n        = sum(sub$value > 0),
                 n_total  = nrow(sub),
                 stringsAsFactors = FALSE)
    }
  ))
  rownames(plot_data) <- NULL
  plot_data$p <- plot_data$n / plot_data$n_total
  # Convert dimension code to a nice label
  plot_data$name <- factor(
    plot_data$name,
    levels = dimension_names,
    labels = c("Mobility", "Self-care", "Usual activities", "Pain & Discomfort", "Anxiety & Depression")
  )
  
  ### 3) Plot ###
  # We assume .pchc_plot_by_dim() is a helper function that
  # takes a data frame w/ columns: groupvar, fu, name, p
  # and returns a ggplot object. Alternatively, you can build your own chart here.
  
  p <- .pchc_plot_by_dim(
    plot_data = plot_data,
    ylab      = "Percentage of respondents who improved overall",
    title     = "Percentage of respondents who improved overall \nby the dimensions in which they improved (%)",
    cols      = .gen_colours("green", length(unique(plot_data$groupvar)))
  )
  
  ### 4) Tidy Up & Return ###
  # Reshape from long to wide for final table
  # e.g. columns for each fu within each groupvar
  # If you want separate columns by groupvar as well, you can unify them in a single pivot.
  
  # Pivot only by fu, keeping groupvar as id col
  plot_data_wide <- stats::reshape(
    plot_data[, c("groupvar", "name", "fu", "p")],
    idvar     = c("groupvar", "name"),
    timevar   = "fu",
    direction = "wide"
  )
  names(plot_data_wide) <- sub("^p\\.", "", names(plot_data_wide))
  plot_data_wide[is.na(plot_data_wide)] <- 0
  names(plot_data_wide)[names(plot_data_wide) == "groupvar"] <- "Group"
  names(plot_data_wide)[names(plot_data_wide) == "name"]     <- "Values"

  return(list(
    plot_data = plot_data_wide,
    p         = .modify_ggplot_theme(p = p) # if you have a custom theme function
  ))
}


#’ Figure 1.2.3: Percentage of Respondents Who Worsened in Each EQ-5D Dimension, by Group
#'
#' This function identifies respondents with a "Worsen" PCHC state (i.e., overall
#' health state got worse between levels_fu[1] and levels_fu[2]), checks
#' dimension-specific changes (e.g., mo_diff < 0), and summarizes by a grouping variable
#' (e.g., procedure) and time points. It returns a data table and a ggplot object.
#'
#' @param df A data frame containing EQ-5D columns, a grouping variable, an ID column, and a follow-up column
#' @param name_id A character string for the patient ID column
#' @param name_groupvar A character string for the grouping column (e.g., procedure)
#' @param names_eq5d A character vector of EQ-5D dimension names
#' @param name_fu A character string for the follow-up column
#' @param levels_fu A character vector of length 2, specifying the order of the follow-up levels (e.g., c("Pre-op","Post-op"))
#'
#' @return A list containing:
#'   \item{plot_data}{A data frame of "Worsen" percentages by group and dimension}
#'   \item{p}{A ggplot2 object produced by `.pchc_plot_by_dim()`}
#' @export
#'
#' @examples
#' result <- figure_1_2_3(
#'   df = example_data,
#'   name_id = "id",
#'   name_groupvar = "procedure",
#'   names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'   name_fu = "time",
#'   levels_fu = c("Pre-op","Post-op")
#' )
#' result$p        # shows the plot
#' result$plot_data  # shows the summary table
#'
figure_1_2_3 <- function(df,
                         name_id,
                         name_groupvar,
                         names_eq5d = NULL,
                         name_fu = NULL,
                         levels_fu = NULL) {
  
  ### 1) Data Preparation ###
  
  # Replace NULL names with defaults (helper function that sets names_eq5d, name_fu, etc.)
  temp <- .get_names(df = df,
                     names_eq5d = names_eq5d,
                     name_fu = name_fu,
                     levels_fu = levels_fu)
  
  names_eq5d <- temp$names_eq5d
  name_fu    <- temp$name_fu
  levels_fu  <- temp$levels_fu
  
  # Check columns exist
  names_all <- c(name_id, name_groupvar, names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df))) {
    stop("Provided column names not in dataframe. Stopping.")
  }
  
  # Keep only relevant columns
  df <- df[, names_all, drop = FALSE]

  # Rename internal columns
  names(df)[names(df) == name_id]       <- "id"
  names(df)[names(df) == name_groupvar] <- "groupvar"

  # Prepare EQ-5D & Follow-up columns
  df <- .prep_eq5d(df = df, names = names_eq5d)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)

  # Sort by (id, groupvar, fu)
  df <- df[order(df$id, df$groupvar, df$fu), , drop = FALSE]

  # Ensure uniqueness of (id, groupvar, fu)
  .check_uniqueness(df, group_by = c("id", "groupvar", "fu"))

  ### 2) Analysis: PCHC ###

  # .pchc() calculates difference columns like mo_diff, sc_diff, etc.
  # level_fu_1 is the 'baseline' time point.
  df <- .pchc(df = df, level_fu_1 = levels_fu[1])
  df <- df[!is.na(df$state), , drop = FALSE]

  # We focus on those whose overall state == "Worsen"
  # then check dimension-specific changes, e.g., mo_diff < 0
  dimension_names <- c("mo_diff", "sc_diff", "ua_diff", "pd_diff", "ad_diff")

  df_wor <- df[df$state == "Worsen", c("groupvar", "fu", dimension_names), drop = FALSE]
  # pivot longer manually
  df_long <- do.call(rbind, lapply(dimension_names, function(dn) {
    data.frame(groupvar = df_wor$groupvar,
               fu       = df_wor$fu,
               name     = dn,
               value    = df_wor[[dn]],
               stringsAsFactors = FALSE)
  }))
  # group_by(groupvar, fu, name) summarise
  plot_data <- do.call(rbind, lapply(
    split(df_long, list(df_long$groupvar, df_long$fu, df_long$name), drop = TRUE),
    function(sub) {
      data.frame(groupvar = sub$groupvar[1],
                 fu       = sub$fu[1],
                 name     = sub$name[1],
                 n        = sum(sub$value < 0),
                 n_total  = nrow(sub),
                 stringsAsFactors = FALSE)
    }
  ))
  rownames(plot_data) <- NULL
  plot_data$p <- plot_data$n / plot_data$n_total
  # Convert dimension code to a nice label
  plot_data$name <- factor(
    plot_data$name,
    levels = dimension_names,
    labels = c("Mobility", "Self-care", "Usual activities", "Pain & Discomfort", "Anxiety & Depression")
  )
  
  ### 3) Plot ###
  
  # We assume .pchc_plot_by_dim() is a helper function that
  # creates a bar chart from plot_data with columns:
  # groupvar, fu, name, and p
  # The color palette is "orange" for Worsen by dimension
  p <- .pchc_plot_by_dim(
    plot_data = plot_data,
    ylab      = "Percentage of respondents who worsened overall",
    title     = "Percentage of respondents who worsened overall \nby the dimensions in which they got worse (%)",
    cols      = .gen_colours("orange", length(unique(plot_data$groupvar)))
  )
  
  ### 4) Tidy Up & Return ###

  # Reshape from long to wide for final table
  plot_data_wide <- stats::reshape(
    plot_data[, c("groupvar", "name", "fu", "p")],
    idvar     = c("groupvar", "name"),
    timevar   = "fu",
    direction = "wide"
  )
  names(plot_data_wide) <- sub("^p\\.", "", names(plot_data_wide))
  plot_data_wide[is.na(plot_data_wide)] <- 0
  names(plot_data_wide)[names(plot_data_wide) == "groupvar"] <- "Group"
  names(plot_data_wide)[names(plot_data_wide) == "name"]     <- "Values"

  return(list(
    plot_data = plot_data_wide,
    p         = .modify_ggplot_theme(p = p) # if you have a custom theme function
  ))
}


#' Figure 1.2.4: Percentage of Respondents Who Had a Mixed Change Overall,
#' by Dimension Improved or Worsened, Grouped by Procedure (or Other Grouping)
#'
#' This function focuses on patients classified as having "Mixed change" overall
#' (i.e., some dimensions improved, others worsened). It then examines which dimensions
#' improved vs. worsened for each subject. Results are summarized by a grouping variable
#' (e.g., procedure) and time points. The final output is a table plus a ggplot object.
#'
#' @param df Data frame containing columns for EQ-5D dimensions, a grouping variable,
#' a patient ID, and a follow-up variable
#' @param name_id Character string indicating the patient ID column
#' @param name_groupvar Character string for the grouping column (e.g. "procedure")
#' @param names_eq5d Character vector naming the EQ-5D dimensions (e.g. c("mo","sc","ua","pd","ad"))
#' @param name_fu Character string for the follow-up column (e.g. "time")
#' @param levels_fu Character vector of length 2 specifying the time order (e.g. c("Pre-op","Post-op"))
#'
#' @return A list with two elements:
#'   \item{plot_data}{A wide-format data frame of dimension-specific improvements/worsenings for "Mixed change"}
#'   \item{p}{A ggplot2 object showing a dimension-level bar chart from .pchc_plot_by_dim}
#'
#' @export
#'
#' @examples
#' result <- figure_1_2_4(
#'   df = example_data,
#'   name_id = "id",
#'   name_groupvar = "procedure",
#'   names_eq5d = c("mo","sc","ua","pd","ad"),
#'   name_fu = "time",
#'   levels_fu = c("Pre-op","Post-op")
#' )
#' result$plot_data
#' result$p
#'
figure_1_2_4 <- function(df,
                         name_id,
                         name_groupvar,
                         names_eq5d = NULL,
                         name_fu = NULL,
                         levels_fu = NULL) {
  
  ### 1) Data Preparation ###
  
  # Replace NULL names with defaults
  temp <- .get_names(df = df,
                     names_eq5d = names_eq5d,
                     name_fu = name_fu,
                     levels_fu = levels_fu)
  names_eq5d <- temp$names_eq5d
  name_fu    <- temp$name_fu
  levels_fu  <- temp$levels_fu
  
  # Check columns exist
  names_all <- c(name_id, name_groupvar, names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df))) {
    stop("Provided column names not in dataframe. Stopping.")
  }
  
  # Keep only relevant columns
  df <- df[, names_all, drop = FALSE]

  # Rename for internal use
  names(df)[names(df) == name_id]       <- "id"
  names(df)[names(df) == name_groupvar] <- "groupvar"

  # Prepare EQ-5D & Follow-up columns
  df <- .prep_eq5d(df = df, names = names_eq5d)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)

  # Sort by (id, groupvar, fu)
  df <- df[order(df$id, df$groupvar, df$fu), , drop = FALSE]

  # Check uniqueness
  .check_uniqueness(df, group_by = c("id", "groupvar", "fu"))

  ### 2) Analysis: Identify Mixed Change ###

  # .pchc() adds difference columns like mo_diff, sc_diff, etc.
  # level_fu_1 is the 'baseline' time point
  df <- .pchc(df = df, level_fu_1 = levels_fu[1])
  df <- df[!is.na(df$state), , drop = FALSE]

  # We'll focus on those with "Mixed change" overall
  dimension_names <- c("mo_diff", "sc_diff", "ua_diff", "pd_diff", "ad_diff")

  # We also define the fu values and directions for labeling
  n_time <- length(levels_fu)
  # For example: "Improve: Pre-op", "Improve: Post-op", "Worsen: Pre-op", "Worsen: Post-op"
  levels_fu_change <- c(
    paste0("Improve: ", levels_fu),
    paste0("Worsen: ", levels_fu)
  )

  # 2a) Summarize dimension-level improvements & worsenings among "Mixed change" patients
  df_mix <- df[df$state == "Mixed change", c("groupvar", "fu", dimension_names), drop = FALSE]
  # pivot longer manually
  df_long <- do.call(rbind, lapply(dimension_names, function(dn) {
    data.frame(groupvar = df_mix$groupvar,
               fu       = df_mix$fu,
               name     = dn,
               value    = df_mix[[dn]],
               stringsAsFactors = FALSE)
  }))
  # group_by(groupvar, fu, name) summarise
  agg <- do.call(rbind, lapply(
    split(df_long, list(df_long$groupvar, df_long$fu, df_long$name), drop = TRUE),
    function(sub) {
      data.frame(groupvar  = sub$groupvar[1],
                 fu        = as.character(sub$fu[1]),
                 name      = sub$name[1],
                 n_improve = sum(sub$value > 0),
                 n_worsen  = sum(sub$value < 0),
                 n_total   = nrow(sub),
                 stringsAsFactors = FALSE)
    }
  ))
  rownames(agg) <- NULL
  agg$Improve <- agg$n_improve / agg$n_total
  agg$Worsen  <- agg$n_worsen  / agg$n_total
  # Convert e.g. "mo_diff" -> "Mobility"
  agg$name <- factor(
    agg$name,
    levels = dimension_names,
    labels = c("Mobility", "Self-care", "Usual activities", "Pain & Discomfort", "Anxiety & Depression")
  )
  # pivot Improve & Worsen into single column
  plot_data_imp <- data.frame(groupvar = agg$groupvar, fu = agg$fu, name = agg$name,
                               change = "Improve", p = agg$Improve, stringsAsFactors = FALSE)
  plot_data_wor <- data.frame(groupvar = agg$groupvar, fu = agg$fu, name = agg$name,
                               change = "Worsen",  p = agg$Worsen,  stringsAsFactors = FALSE)
  plot_data <- rbind(plot_data_imp, plot_data_wor)
  # Build combined factor "Improve: Pre-op" / "Worsen: Pre-op" etc.
  plot_data$fu <- factor(paste0(plot_data$change, ": ", plot_data$fu), levels = levels_fu_change)
  
  ### 3) Plot ###
  
  plot_data2 <- plot_data
  plot_data2$groupvar <- paste(plot_data$groupvar, plot_data$fu)
  
  plot_data2$name   <- factor(plot_data2$name,   levels = c("Mobility", "Self-care", "Usual activities", "Pain & Discomfort", "Anxiety & Depression"))
  plot_data2$change <- factor(plot_data2$change, levels = c("Improve", "Worsen"))
  plot_data2 <- plot_data2[order(plot_data2$name, plot_data2$change), , drop = FALSE]
  
  plot_data2$groupvar <- factor(plot_data2$groupvar, levels = unique(plot_data2$groupvar))
  
  p <- .pchc_plot_by_dim(
    plot_data = plot_data2,
    ylab      = "Percentage of respondents who had mixed change",
    title     = "Percentage of respondents who had a mixed change overall \nby the dimensions in which they improved and worsened (%)",
    cols      = c(
      .gen_colours("green", n_time*2),  # for "Improve: ...",
      .gen_colours("orange", n_time*2)  # for "Worsen: ..."
    ),
    text_rotate = TRUE
  )
  
  ### 4) Reshape & Return ###

  # Pivot wide by the combined fu factor
  plot_data_wide <- stats::reshape(
    plot_data[, c("groupvar", "name", "fu", "p")],
    idvar     = c("groupvar", "name"),
    timevar   = "fu",
    direction = "wide"
  )
  names(plot_data_wide) <- sub("^p\\.", "", names(plot_data_wide))
  plot_data_wide[is.na(plot_data_wide)] <- 0
  names(plot_data_wide)[names(plot_data_wide) == "groupvar"] <- "Group"
  names(plot_data_wide)[names(plot_data_wide) == "name"]     <- "Values"

  return(list(
    plot_data = plot_data_wide,
    p         = .modify_ggplot_theme(p = p)
  ))
}

#' Figure 1.2.5: Health Profile Grid (HPG) for Two Time Points
#'
#' This function creates a Health Profile Grid (HPG) for EQ-5D data, plotting each
#' individual's change in health states (ranked from best to worst) between two time
#' points. A diagonal reference line indicates no change; points above the line reflect
#' improvement, and points below indicate deterioration.
#'
#' @param df A data frame containing EQ-5D columns, a grouping variable, an ID column, and a follow-up column
#' @param names_eq5d A character vector of EQ-5D dimension names
#' @param name_fu A character string for the follow-up column
#' @param levels_fu A character vector of length 2, specifying the order of the follow-up levels (e.g., c("Pre-op","Post-op"))
#' @param name_id A character string for the patient ID column
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' @return A list with components:
#'   \item{plot_data}{The plot data with ranks and classification.}
#'   \item{p}{A \code{ggplot2} object displaying the HPG scatter plot.}
#' @export
#' @examples
#' tmp <- figure_1_2_5(
#'            df = example_data, 
#'            names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'            name_fu = "time", 
#'            levels_fu = c("Pre-op", "Post-op"), 
#'            name_id = "id", 
#'            eq5d_version = "3L", 
#'            country = "UK"
#'        )

figure_1_2_5 <- function(df,
                         names_eq5d,
                         name_fu,
                         levels_fu = NULL,
                         name_id,
                         eq5d_version,
                         country) {
  ### 1) Data Preparation ###
  # Replace NULL names with defaults (helper function that sets names_eq5d, name_fu, etc.)
  temp <- .get_names(df = df,
                     names_eq5d = names_eq5d,
                     name_fu = name_fu,
                     levels_fu = levels_fu,
                     eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  name_fu    <- temp$name_fu
  levels_fu  <- temp$levels_fu
  eq5d_version <- temp$eq5d_version
  
  # Check columns exist
  names_all <- c(name_id,  names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df))) {
    stop("Provided column names not in dataframe. Stopping.")
  }
  
  # Keep only relevant columns
  df <- df[, names_all, drop = FALSE]

  # Rename for internal use
  names(df)[names(df) == name_id] <- "id"

  # Prepare EQ-5D & Follow-up columns
  df <- .prep_eq5d(df = df, names = names_eq5d)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)

  # Sort by (id, fu)
  df <- df[order(df$id, df$fu), , drop = FALSE]

  # Check uniqueness: each (id, fu) should appear at most once
  .check_uniqueness(df, group_by = c("id", "fu"))

  ### 2) Calculate PCHC (Paretian Classification) ###
  # .pchc() compares the first level in levels_fu (e.g. "Pre-op") vs. the second (e.g. "Post-op")
  df <- .pchc(df = df, level_fu_1 = levels_fu[1], add_noprobs = TRUE)
  df <- df[!is.na(df$state), , drop = FALSE]
  # The factor levels for PCHC categories can be: "No problems", "No change", "Improve", "Worsen", "Mixed change"
  df$state_noprobs <- factor(
    df$state_noprobs,
    levels = c("No problems", "No change", "Improve", "Worsen", "Mixed change")
  )

  ### 3) Summarize for HPG ###
  # Create value set
  vs <- data.frame(profile = make_all_EQ_indexes(version = eq5d_version))
  vs$utility <- eq5d(vs$profile, country = country, version = eq5d_version)
  vs <- vs[order(-vs$utility), , drop = FALSE]
  vs$rank <- seq_len(nrow(vs))
  
  # Build profiles
  eqdims <- names_eq5d
  for (d in eqdims) {
    df[[paste0(d, "_t2")]] <- df[[d]]
    df[[paste0(d, "_t1")]] <- df[[d]] + df[[paste0(d, "_diff")]]
  }
  # Build profile codes row-by-row using apply
  t1_cols <- paste0(eqdims, "_t1")
  t2_cols <- paste0(eqdims, "_t2")
  df$profile_t1 <- as.integer(apply(df[, t1_cols, drop = FALSE], 1,
                                     function(r) paste0(r, collapse = "")))
  df$profile_t2 <- as.integer(apply(df[, t2_cols, drop = FALSE], 1,
                                     function(r) paste0(r, collapse = "")))

  # Get rank via match
  df$rank_t1 <- vs$rank[match(df$profile_t1, vs$profile)]
  df$rank_t2 <- vs$rank[match(df$profile_t2, vs$profile)]
  
  # Cteate plot
  max_rank <- nrow(vs)
  p <- ggplot(df, aes(x = .data$rank_t2, y = .data$rank_t1, color = .data$state, shape = .data$state)) +
    geom_point(size = 3) +
    geom_abline(intercept = 0, slope = 1, linetype = "solid", color = "black") +
    scale_x_continuous(limits = c(1, max_rank), expand = c(0,0)) +
    scale_y_continuous(limits = c(1, max_rank), expand = c(0,0)) +
    labs(
      title    = "Health Profile Grid (HPG)",
      x        = paste0(levels_fu[1], " rank"),
      y        = paste0(levels_fu[2], " rank"),
      color    = "Classification",  # Legend title for color
      shape    = "Classification"   # Legend title for shape
    ) +
    scale_shape_manual(values = c(
      "Improve"    = 3,  
      "No change"  = 16,  
      "Worsen"     = 4,   
      "Mixed change" = 17  
    )) +
    scale_color_manual(values = c(
      "Improve"      = "#69AB3E",
      "No change"    = "#FDC020",
      "Worsen"       = "#BF2518",
      "Mixed change" = "#5782BF"
    )) +
    
    theme_minimal(base_size = 14) + 
    theme(
      panel.grid = element_blank(),
      axis.line  = element_line(color = "black"),
      legend.position  = "bottom",
      legend.direction = "horizontal",
      legend.box       = "horizontal"
    ) +
    coord_fixed(ratio = 1)
  
  return(list(plot_data = df, p = p))
}



#' Figure 1.3.1: EQ-5D values plotted against LSS
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' df <- data.frame(make_all_EQ_states(version = "5L"))
#' tmp <- figure_1_3_1(
#'  df, 
#'  names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'  eq5d_version = "5L", 
#'  country = "US"
#' )
#' tmp$p
#' tmp$plot_data

figure_1_3_1 <- function(df,
                       names_eq5d = NULL,
                       eq5d_version = NULL,
                       country){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_lss = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- df[, c("lss", "utility"), drop = FALSE]

  ### analysis ###

  # prepare data for plotting: remove NAs, group by lss, summarise
  df2 <- df[!is.na(df$lss) & !is.na(df$utility), , drop = FALSE]
  plot_data <- do.call(rbind, lapply(split(df2$utility, df2$lss), function(vals) {
    data.frame(median = median(vals), min = min(vals), max = max(vals))
  }))
  plot_data$lss <- as.integer(rownames(plot_data))
  rownames(plot_data) <- NULL
  plot_data <- plot_data[order(plot_data$lss), , drop = FALSE]

  # graphical parameters
  x_breaks <- 1:100
  y_breaks <- seq(from = -1, to = 1, by = 0.2)

  # pivot longer for segment plot
  seg_data <- do.call(rbind, list(
    data.frame(lss = plot_data$lss, name = "Median", value = plot_data$median),
    data.frame(lss = plot_data$lss, name = "Lowest",  value = plot_data$min),
    data.frame(lss = plot_data$lss, name = "Highest", value = plot_data$max)
  ))
  seg_data$name <- factor(seg_data$name, levels = c("Median", "Lowest", "Highest"))

  # plot
  p <- ggplot() +
    # plot median, min and max
    geom_segment(data = seg_data,
                 aes(x = .data$lss - 0.2, xend = .data$lss + 0.2,
                     y = .data$value, yend = .data$value,
                     colour = .data$name)) +
    # connect values with a line segment
    geom_segment(data = plot_data, aes(x = .data$lss, xend = .data$lss, y = min, yend = max)) +
    # plot title
    ggtitle("EQ-5D values plotted against the LSS (Level Sum Score)") +
    # manipulate x-axis
    scale_x_continuous(name = "Level Sum Score",
                       breaks = x_breaks,
                       expand = expansion(add = c(0.5, 0.5))) +
    # manipulate y-axis
    scale_y_continuous(name = "EQ-5D value",
                       breaks = y_breaks)

  # tidy up summary
  names(plot_data)[names(plot_data) == "lss"]    <- "LSS"
  names(plot_data)[names(plot_data) == "median"] <- "Median"
  names(plot_data)[names(plot_data) == "min"]    <- "Minimum"
  names(plot_data)[names(plot_data) == "max"]    <- "Maximum"
  
  return(list(plot_data = plot_data, p = .modify_ggplot_theme(p = p)))
}

#' Figure 1.3.2: EQ-5D values plotted against LFS
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' tmp <- figure_1_3_2(
#'  example_data, 
#'  names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'  eq5d_version = "3L",
#'  country = "UK"
#' )
#' tmp$p
#' tmp$plot_data

figure_1_3_2 <- function(df,
                        names_eq5d = NULL,
                        eq5d_version = NULL,
                        country){
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_lfs = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- df[, c("lfs", "utility"), drop = FALSE]

  ### analysis ###

  # prepare data for plotting: remove NAs, group by lfs, summarise
  df2 <- df[!is.na(df$lfs) & !is.na(df$utility), , drop = FALSE]
  plot_data <- do.call(rbind, lapply(split(df2$utility, df2$lfs), function(vals) {
    data.frame(median = median(vals), min = min(vals), max = max(vals))
  }))
  plot_data$lfs <- rownames(plot_data)
  rownames(plot_data) <- NULL
  # order by descending median
  plot_data <- plot_data[order(-plot_data$median), , drop = FALSE]

  # impose order on the x-axis
  lfs_levels <- plot_data$lfs
  plot_data$lfs_f <- factor(plot_data$lfs, levels = lfs_levels)
  
  # axis breaks
  n <- length(lfs_levels)
  i <- seq(from = 1, to = n, by = 3)
  x_breaks <- (1:n)[i]
  x_labels <- lfs_levels[i]
  y_breaks <- seq(from = -1, to = 1, by = 0.2)
  
  # pivot longer for segment plot
  seg_data2 <- do.call(rbind, list(
    data.frame(lfs = plot_data$lfs, lfs_f = plot_data$lfs_f, name = "Median", value = plot_data$median),
    data.frame(lfs = plot_data$lfs, lfs_f = plot_data$lfs_f, name = "Lowest",  value = plot_data$min),
    data.frame(lfs = plot_data$lfs, lfs_f = plot_data$lfs_f, name = "Highest", value = plot_data$max)
  ))
  seg_data2$name <- factor(seg_data2$name, levels = c("Median", "Lowest", "Highest"))

  # plot
  p <- ggplot() +
    # plot median, min and max
    geom_segment(data = seg_data2,
                 aes(x = as.numeric(.data$lfs_f) - 0.5, xend = as.numeric(.data$lfs_f) + 0.5,
                     y = .data$value, yend = .data$value,
                     colour = .data$name)) +
    # connect values with a line segment
    geom_segment(data = plot_data, aes(x = as.numeric(.data$lfs_f), xend = as.numeric(.data$lfs_f), y = min, yend = max)) +
    # plot title
    ggtitle("EQ-5D values plotted against the LFS (Level Frequency Score)") +
    # manipulate x-axis
    scale_x_continuous(name = "Level Frequency Score", 
                       breaks = x_breaks,
                       labels = x_labels,
                       expand = expansion(add = c(0.5, 0.5))) +
    # manipulate y-axis
    scale_y_continuous(name = "EQ-5D value",
                       breaks = y_breaks)
  p <- .modify_ggplot_theme(p = p) + 
    # tidy up the graph
    theme(
      # rotate labels in the x-axis
      axis.text.x = element_text(angle = 90))
  
  # tidy up summary
  plot_data <- plot_data[, setdiff(names(plot_data), "lfs_f"), drop = FALSE]
  names(plot_data)[names(plot_data) == "lfs"]    <- "LFS"
  names(plot_data)[names(plot_data) == "median"] <- "Median"
  names(plot_data)[names(plot_data) == "min"]    <- "Minimum"
  names(plot_data)[names(plot_data) == "max"]    <- "Maximum"
  
  return(list(plot_data = plot_data, p = p))
}

#' Figure 2.1: EQ VAS scores
#' 
#' @param df Data frame with the VAS column
#' @param name_vas Character string for the VAS column
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' tmp <- figure_2_1(example_data, name_vas = 'vas')
#' tmp$p
#' tmp$plot_data

figure_2_1 <- function(df, name_vas = NULL){
  
  ### data preparation ###
  # replace NULL names with defaults
  temp <- .get_names(name_vas = name_vas)
  name_vas <- temp$name_vas
  # check existence of columns 
  names_all <- c(name_vas)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_vas(df = df, name = name_vas)
  
  ### analysis ###
  
  # plot
  p <- ggplot(df, aes(x = .data$vas)) +
    geom_bar(width = 0.1, fill = "blue") + 
    scale_x_continuous(name = "EQ VAS score", 
                       breaks = seq(from = 0, to = 100, by = 5),
                       limits = c(-3, 103),
                       expand = c(0, 0)) +
    scale_y_continuous(name = "Frequency",
                       expand = expansion(mult = c(0, 0.05))) + 
    ggtitle("EQ VAS Scores") + 
    # tidy up the graph
    theme(
      # reinstate x-axis ticks
      axis.ticks.x = element_line(colour = "grey50"))
  
  # output plotting data: count for each value between 0 & 100
  vas_factor <- factor(df$vas, levels = 1:100)
  tbl <- tabulate(vas_factor, nbins = 100L)
  plot_data <- data.frame(vas = factor(1:100, levels = 1:100), count = tbl,
                          stringsAsFactors = FALSE)
  
  return(list(plot_data = plot_data, p = .modify_ggplot_theme(p = p)))
}

#' Figure 2.2: Mid-point EQ VAS scores
#' 
#' @param df Data frame with the VAS column
#' @param name_vas Character string for the VAS column
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' tmp <- figure_2_2(example_data, name_vas = 'vas')
#' tmp$p
#' tmp$plot_data

figure_2_2 <- function(df, name_vas = NULL){
  
  # produce output to be plotted (Table 3.2)
  # do not add totals
  plot_data <- table_2_2(df = df, name_vas = name_vas, add_na_total = FALSE)
  # remove Range column, not needed
  plot_data <- plot_data[, setdiff(names(plot_data), "Range"), drop = FALSE]
  
  # plot
  p <- ggplot(plot_data, aes(x = .data$Midpoint, y = .data$Frequency)) +
    geom_bar(width = 0.7, fill = "blue", stat = "identity") + coord_flip() +
    scale_x_continuous(name = "EQ VAS score (Midpoint)", 
                       breaks = seq(from = 0, to = 100, by = 10),
                       limits = c(-3, 103),
                       expand = c(0, 0)) +
    scale_y_continuous(name = "Frequency",
                       expand = expansion(mult = c(0, 0.05))) + 
    ggtitle("EQ VAS Scores (Midpoint)") +
    # tidy up the graph
    theme(
      # reinstate x-axis ticks
      axis.ticks.x = element_line(colour = "grey50"),
      # remove y-axis ticks
      axis.ticks.y = element_blank())
  
  return(list(plot_data = plot_data, p = .modify_ggplot_theme(p = p)))
}

#' Figure 3.1: EQ-5D values by timepoints: mean values and 95\% confidence intervals
#' 
#' @param df Data frame with the VAS columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' If NULL (default value), the levels will be ordered in the order of appearance in df.
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' tmp <- figure_3_1(
#'  example_data,
#'  names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'  name_fu = "time",
#'  levels_fu = c('Pre-op', 'Post-op'),
#'  eq5d_version = "3L",
#'  country = "UK"
#' )
#' tmp$p
#' tmp$plot_data

figure_3_1 <- function(df,
                       names_eq5d = NULL,
                       name_fu = NULL,
                       levels_fu = NULL,
                       eq5d_version = NULL,
                       country) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df, names_eq5d = names_eq5d, 
                     name_fu = name_fu, levels_fu = levels_fu,
                     eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  name_fu <- temp$name_fu
  levels_fu <- temp$levels_fu
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d, name_fu)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)
  df <- df[, c("fu", "utility"), drop = FALSE]
  
  ### analysis ###
  
  # prepare data for plotting
  plot_data <- .summary_mean_ci(df = df, group_by = "fu")
  
  # plot
  p <- ggplot(data = plot_data, aes(x = .data$fu, 
                                    y = .data$mean, ymin = .data$ci_lb, ymax = .data$ci_ub)) +
    # connect means with a line
    # create a dummy group variable as geom_line only connects points that belong to the same group
    geom_line(aes(group = 1), colour = "red") +
    # plot means
    geom_point(colour = "green") + 
    # add error bars
    geom_errorbar(width = 0.1) +
    # plot title
    ggtitle(paste0("EQ-5D values by ", name_fu, ": mean and 95% confidence intervals")) +
    # manipulate x-axis
    scale_x_discrete(name = "time") +
    # manipulate y-axis
    scale_y_continuous(name = "EQ-5D value")
  
  return(list(plot_data = plot_data, p = .modify_ggplot_theme(p = p)))
}

#' Figure 3.2: Mean EQ-5D values and 95\% confidence intervals: all vs by groupvar
#' 
#' @param df Data frame with the EQ-5D and grouping columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_groupvar Character string for the grouping column
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' tmp <- figure_3_2(
#'  example_data,
#'  names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'  name_groupvar = "procedure",
#'  eq5d_version = "3L",
#'  country = "UK"
#' )
#' tmp$p
#' tmp$plot_data

figure_3_2 <- function(df,
                       names_eq5d = NULL,
                       name_groupvar,
                       eq5d_version = NULL,
                       country) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(name_groupvar, names_eq5d)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  names(df)[names(df) == name_groupvar] <- "groupvar"
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- df[, c("groupvar", "utility"), drop = FALSE]

  ### analysis ###

  # prepare data for plotting
  plot_data <- .summary_mean_ci(df = df, group_by = "groupvar")

  # add totals
  plot_data_total <- .summary_mean_ci(df = df, group_by = NULL)
  plot_data_total$groupvar <- "All groups"
  plot_data_total <- plot_data_total[, c("groupvar", setdiff(names(plot_data_total), "groupvar")), drop = FALSE]

  # combine
  groupvar_levels <- c(plot_data$groupvar, "All groups")
  plot_data <- rbind(plot_data, plot_data_total)
  plot_data$groupvar <- factor(plot_data$groupvar, levels = groupvar_levels)
  plot_data <- plot_data[order(plot_data$groupvar), , drop = FALSE]
  
  # plot
  p <- ggplot(data = plot_data, aes(x = .data$groupvar, y = .data$mean, ymin = .data$ci_lb, ymax = .data$ci_ub)) +
    # plot means
    geom_bar(colour = "blue", fill = "blue", stat = "identity", width = 0.5) + 
    geom_point() +
    # add error bars
    geom_errorbar(width = 0.1, size = 1) +
    # plot title
    ggtitle(paste0("Mean EQ-5D values 95% confidence intervals: all vs by ", name_groupvar)) +
    # manipulate x-axis
    scale_x_discrete(name = name_groupvar) +
    # manipulate y-axis
    scale_y_continuous(name = "EQ-5D value",
                       limits = c(0, 1),
                       breaks = seq(from = 0, to = 1, by = 0.1),
                       expand = c(0, 0))
  
  return(list(plot_data = plot_data, p = .modify_ggplot_theme(p = p)))
}

#' Figure 3.3: EQ-5D values: smoothed lines and confidence intervals by groupvar
#' 
#' @param df Data frame with the EQ-5D, follow-up and grouping columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_fu Character string for the follow-up column
#' @param levels_fu Character vector containing the order of the values in the follow-up column. 
#' If NULL (default value), the levels will be ordered in the order of appearance in df.
#' @param name_groupvar Character string for the grouping column
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' tmp <- figure_3_3(
#'  example_data,
#'  names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'  name_fu = "time",
#'  levels_fu = c('Pre-op', 'Post-op'),
#'  name_groupvar = "procedure",
#'  eq5d_version = "3L",
#'  country = "UK"
#' )
#' tmp$p
#' tmp$plot_data

figure_3_3 <- function(df, 
                       names_eq5d = NULL,
                       name_fu = NULL,
                       levels_fu = NULL,
                       name_groupvar,
                       eq5d_version = NULL,
                       country) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(df = df, names_eq5d = names_eq5d, 
                     name_fu = name_fu, levels_fu = levels_fu,
                     eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  name_fu <- temp$name_fu
  levels_fu <- temp$levels_fu
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d, name_fu, name_groupvar)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  names(df)[names(df) == name_groupvar] <- "groupvar"
  df$groupvar <- factor(df$groupvar)
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- .prep_fu(df = df, name = name_fu, levels = levels_fu)
  df <- df[, c("fu", "groupvar", "utility"), drop = FALSE]
  
  ### analysis ###
  
  # prepare data for plotting
  plot_data <- .summary_mean_ci(df = df, group_by = c("fu", "groupvar"))
  
  # plot
  p <- ggplot(data = plot_data, aes(x = .data$fu, 
                                    y = .data$mean, ymin = .data$ci_lb, ymax = .data$ci_ub,
                                    group = .data$groupvar, colour = .data$groupvar)) +
    # plot means
    geom_point() +
    # add error bars
    geom_errorbar(width = 0.1, alpha = 0.5) +
    # connect means with a smooth line
    geom_line() +
    # plot title
    ggtitle(paste0("Longitudinal EQ-5D values by ", name_fu, " and ", name_groupvar, ":\nmeans and 95% confidence intervals")) +
    # manipulate x-axis
    scale_x_discrete(name = name_fu) +
    # manipulate y-axis
    scale_y_continuous(name = "EQ-5D value")
  
  return(list(plot_data = plot_data, p = .modify_ggplot_theme(p = p)))
}

#' Figure 3.4: EQ-5D values: smoothed lines and confidence intervals by groupvar
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' tmp <- figure_3_4(
#'  example_data,
#'  names_eq5d = c("mo", "sc", "ua", "pd", "ad"), 
#'  eq5d_version = "3L",
#'  country = "UK"
#' )
#' tmp$p
#' tmp$plot_data

figure_3_4 <- function(df, 
                       names_eq5d = NULL,
                       eq5d_version = NULL,
                       country) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- df[, "utility", drop = FALSE]

  ### analysis ###

  # prepare data for plotting
  util_tbl <- table(df$utility)
  plot_data <- data.frame(
    utility = as.numeric(names(util_tbl)),
    count   = as.integer(util_tbl),
    stringsAsFactors = FALSE
  )
  plot_data <- plot_data[order(plot_data$utility), , drop = FALSE]
  
  # plot
  p <- ggplot(plot_data, aes(x = .data$utility, y = .data$count)) +
    geom_bar(fill = "blue", stat = "identity") + 
    scale_x_continuous(name = "EQ-5D value") +
    scale_y_continuous(name = "Frequency") + 
    ggtitle("Distribution of EQ-5D values") + 
    theme(
      # reinstate axis.ticks
      axis.ticks = element_line(colour = "grey50"))
  
  return(list(plot_data = plot_data, p = .modify_ggplot_theme(p = p)))
}

#' Figure 3.5: EQ-5D values: smoothed lines and confidence intervals by groupvar
#' 
#' @param df Data frame with the EQ-5D columns
#' @param names_eq5d Character vector of column names for the EQ-5D dimensions
#' @param name_vas Character string for the VAS column
#' @param eq5d_version Version of the EQ-5D instrument
#' @param country A character string representing the name of the country. 
#' This could be in a 2-letter format, full name or short name, as specified in the country_codes datasets.
#' @return Summary plot and data used for plotting
#' @export
#' @examples
#' tmp <- figure_3_5(
#'    example_data,
#'    names_eq5d = c("mo", "sc", "ua", "pd", "ad"),
#'    name_vas = "vas",
#'    eq5d_version = "3L",
#'    country = "UK"
#'  )
#' tmp$p
#' tmp$plot_data

figure_3_5 <- function(df,
                        names_eq5d = NULL,
                        name_vas = NULL,
                        eq5d_version = NULL,
                        country) {
  
  ### data preparation ###
  
  # replace NULL names with defaults
  temp <- .get_names(names_eq5d = names_eq5d, name_vas = name_vas, eq5d_version = eq5d_version)
  names_eq5d <- temp$names_eq5d
  name_vas <- temp$name_vas
  eq5d_version <- temp$eq5d_version
  # check existence of columns 
  names_all <- c(names_eq5d, name_vas)
  if (!all(names_all %in% colnames(df)))
    stop("Provided column names not in dataframe. Stopping.")
  # all columns defined and exist; only leave relevant columns now
  df <- df[, names_all, drop = FALSE]
  # further checks and data preparation
  df <- .prep_vas(df = df, name = name_vas)
  df <- .prep_eq5d(df = df, names = names_eq5d,
                   add_state = TRUE,
                   add_utility = TRUE, eq5d_version = eq5d_version, country = country)
  df <- df[, c("vas", "utility"), drop = FALSE]

  ### analysis ###

  # prepare data for plotting
  plot_data <- df[order(df$vas, df$utility), , drop = FALSE]
  
  # plot
  p <- ggplot(plot_data, aes(x = .data$vas, y = .data$utility)) +
    geom_point(colour = "blue") + 
    scale_x_continuous(name = "EQ VAS") +
    scale_y_continuous(name = "EQ-5D value") + 
    ggtitle("EQ-5D values plotted against EQ VAS")
  
  return(list(plot_data = plot_data, p = .modify_ggplot_theme(p = p)))
}

Try the eq5dsuite package in your browser

Any scripts or data that you put into this service are public.

eq5dsuite documentation built on April 11, 2026, 5:06 p.m.