R/get_km.R
In easysurv: Simplify Survival Data Analysis and Model Fitting

Documented in get_km print.get_km

#' Generate Kaplan-Meier estimates
#'
#' Calculates Kaplan-Meier estimates for survival data and returns summary
#' statistics, plots, and additional outputs.
#'
#' @param data A data frame containing the survival data.
#' @param time The name of the column in \code{data} containing the
#'   time-to-event information.
#' @param event The name of the column in \code{data} indicating whether the
#'   event of interest occurred.
#' @param group (Optional) The name of the column in \code{data} defining the
#'   grouping variable. Default is \code{NULL}.
#' @param group_labels Optional character vector containing the names of
#'   the strata (default is NULL). Provide in a consistent order with
#'   \code{levels(as.factor(data$group))}.
#' @param just_km Logical. If \code{TRUE}, only the Kaplan-Meier estimates are
#'   returned. Default is \code{FALSE}.
#' @param ... (Optional) Parameters to pass to ggsurvfit.
#'
#' @returns A list containing Kaplan-Meier estimates, summary statistics, and
#'   plots.
#'
#' @export
#'
#' @importFrom cli cli_abort
#' @importFrom dplyr arrange
#' @importFrom ggsurvfit survfit2
#' @importFrom survival Surv survfit
#' @importFrom stats as.formula
#' @importFrom tidyr nest
#'
#' @examples
#' km_results <- get_km(
#'   data = easysurv::easy_bc,
#'   time = "recyrs",
#'   event = "censrec",
#'   group = "group",
#'   risktable_symbols = FALSE
#' )
#'
#' km_results
get_km <- function(data,
                   time,
                   event,
                   group = NULL,
                   group_labels = NULL,
                   just_km = FALSE,
                   ...) {
  # Validate argument inputs ----
  if (!is.data.frame(data)) {
    cli::cli_abort(c(
      "The {.var data} argument must have class {.cls data.frame}.",
      "x" = "You've provided an object of class: {.cls {class(data)}}"
    ))
  }

  # Are the required columns present?
  # note if group is NULL, it is dropped.
  required_cols <- c(time, event, group)

  if (!all(required_cols %in% names(data))) {
    cli::cli_abort(
      paste0(
        "The following columns are missing from `data`: ",
        paste0(required_cols[!required_cols %in% names(data)], collapse = ", ")
      )
    )
  }

  if (is.null(group)) {
    km_covariate <- 1
  } else {
    group_list <- if (is.null(group_labels)) {
      levels(droplevels(as.factor(data[[group]])))
    } else {
      group_labels
    }

    km_covariate <- group
  }

  km_formula <- stats::as.formula(paste0(
    "survival::Surv(time = ",
    time,
    ", event = ",
    event,
    ") ~",
    km_covariate
  ))

  # ggsurvfit::survfit2 as an alternative to survival::survfit
  # it returns the calling environment, which means it removes things like
  # group=Male from the plot output, and keeps the Time label.
  # Revisit if this turns out to be a problem.
  km <- ggsurvfit::survfit2(
    formula = km_formula,
    conf.int = 0.95,
    data = data,
    type = "kaplan-meier"
  )

  if (just_km) {
    return(km)
  }

  km_formula_separate <- stats::as.formula(paste0(
    "survival::Surv(time = ",
    time,
    ", event = ",
    event,
    ") ~",
    1
  ))

  km_all <- survival::survfit(
    formula = km_formula_separate,
    conf.int = 0.95,
    data = data,
    type = "kaplan-meier"
  )

  km_for_excel <- list(all = step_km(km_all))

  if (!is.null(group)) {
    if (!is.factor(data[[group]])) {
      data[[group]] <- as.factor(data[[group]])
    }

    nested <- data |>
      dplyr::arrange(factor(group, levels = levels(group))) |>
      tidyr::nest(.by = all_of(group))

    km_per_group <- lapply(
      purrr::set_names(nested$data, group_list),
      function(data) {
        surv_out <- survival::survfit(
          formula = km_formula_separate,
          conf.int = 0.95,
          data = data,
          type = "kaplan-meier"
        )

        return(surv_out)
      }
    )

    km_for_excel <- c(
      km_for_excel,
      lapply(
        purrr::set_names(km_per_group, group_list),
        function(x) {
          step_km(x)
        }
      )
    )

    km_plot <- plot_km(
      km,
      ...
    )

    km_summary <- summarise_km(km,
      strata_labels = group_list
    )

    rownames(km_summary) <- group_list
  } else {
    km_per_group <- NULL

    km_plot <- plot_km(
      km,
      ...
    )

    km_summary <- summarise_km(km)
  }

  km_median_follow_up <- get_median_fu(
    data = data,
    time = time,
    event = event,
    group = group
  )

  km_summary <- cbind(km_summary, km_median_follow_up)

  if (!is.null(group)) rownames(km_summary) <- group_list

  # Return ----
  out <- list(
    km = km,
    km_for_excel = km_for_excel,
    km_per_group = km_per_group,
    km_plot = km_plot,
    km_summary = km_summary
  )

  # Assign a class
  class(out) <- c(class(out), "get_km")

  out
}


#' Print methods for \code{get_km()}
#'
#' @param x An object of class \code{get_km}
#' @param ... Additional arguments
#'
#' @returns The summary of the Kaplan-Meier estimates, printed via the console.
#'
#' @importFrom cli cli_h1 cli_h2 cli_h3 cli_text
#' @importFrom cli cli_ul cli_li cli_div cli_end
#' @importFrom cli cli_alert cli_alert_info cli_alert_warning cli_rule
#' @importFrom cli cat_line qty
#'
#' @export
#'
#' @examples
#' km_results <- get_km(
#'   data = easysurv::easy_bc,
#'   time = "recyrs",
#'   event = "censrec",
#'   group = "group",
#'   risktable_symbols = FALSE
#' )
#'
#' print(km_results)
print.get_km <- function(x, ...) {
  cli::cli_h1("Kaplan-Meier Data")
  cli::cli_text("The get_km function has produced the following outputs:")
  cli::cli_ul()
  cli::cli_li(paste0(
    "{.strong km}: A {.fn survival::survfit} object for ",
    "Kaplan-Meier estimates."
  ))
  cli::cli_li(paste0(
    "{.strong km_for_excel}: A list of stepped Kaplan-Meier ",
    "data for external plotting."
  ))
  cli::cli_li(paste0(
    "{.strong km_per_group}: A list of Kaplan-Meier ",
    "estimates for each group."
  ))
  cli::cli_li("{.strong km_plot}: A Kaplan-Meier plot.")
  cli::cli_li(paste0(
    "{.strong km_summary}: A summary table of the ",
    "Kaplan-Meier estimates."
  ))
  cli::cli_end()

  cli::cli_h2("km Summary")
  print(x$km_summary)

  cli::cli_par() # As cat_line didn't seem to work.
  cli::cat_line()
  cli::cli_end()

  print(x$km_plot)
  cli::cli_text("{.val km_plot} has been printed.")
  cli::cli_rule()
  cli::cli_alert(c(
    "For more information, run {.code View()} ",
    "on saved {.fn get_km} output."
  ))

  invisible(x)
}


# Helper functions ----

#' Calculate the median follow-up time (or median time to censoring).
#'
#' Reverses the censoring/event variable to derive median follow-up time.
#'
#' @param data A tibble or data frame containing the survival data with
#'   columns for \code{time}, \code{event} and \code{group}.
#' @param time The name of the time variable in data
#' @param event The name of the event variable in data
#' @param group The name of the group variable in data
#'
#' @returns A data.frame containing median follow-up times.
#'
#' @importFrom ggsurvfit survfit2
#' @importFrom survival Surv
#' @importFrom stats quantile
#' @importFrom stats as.formula
#'
#' @noRd
get_median_fu <- function(data,
                          time,
                          event,
                          group = NULL) {
  # Define the Surv object
  the_surv <- survival::Surv(time = data[[time]], event = data[[event]])

  # Inverse censoring/events
  inverse_surv <- the_surv
  inverse_surv[, 2] <- 1 - inverse_surv[, 2]

  # Find median quantile
  if (is.null(group)) {
    quantiles <- stats::quantile(
      ggsurvfit::survfit2(
        stats::as.formula(
          paste0("inverse_surv ~ 1")
        ),
        data = data
      ),
      0.5
    )
  } else {
    quantiles <- stats::quantile(
      ggsurvfit::survfit2(
        stats::as.formula(
          paste0("inverse_surv ~ as.factor(", group, ")")
        ),
        data = data
      ),
      0.5
    )
  }

  # Do not keep CI.
  # as.data.frame is used in case of single group
  out <- as.data.frame(quantiles$quantile)
  colnames(out) <- "Median follow-up"

  out
}


#' Tabulate km data in a 'stepped' manner for external plotting
#'
#' This function takes a survival object generated from a call to
#' \code{\link[survival]{survfit}} and creates a table in a 'stepped' format
#' suitable for external plotting, such as in Excel.
#'
#' @param km A survival object generated from a call to
#'  \code{\link[survival]{survfit}}.
#'  The `km` object should be of class "survfit" and should represent a
#'  single-arm survival analysis (e.g. \code{Surv(time, status) ~ 1}).
#'
#' @returns A data frame containing the 'stepped' tabulated survival data
#' suitable for external plotting or further analysis.
#' The table includes information about the time points, number at risk,
#' survival probability, and the number of events for each time point.
#'
#' @importFrom survival survfit0
#' @importFrom tibble tibble
#'
#' @noRd
step_km <- function(km) {
  # add the first 1 , 0.
  km <- survival::survfit0(km, start.time = 0)

  km_sum <- summary(km, times = km$time)

  time <- rep(km_sum$time, each = 2)[-1]
  nrisk <- c(
    rep(km_sum$n.risk[-length(km_sum$n.risk)], each = 2),
    min(km_sum$n.risk)
  )
  survival <- c(
    rep(km_sum$surv[-length(km_sum$surv)], each = 2),
    min(km_sum$surv)
  )

  count <- rep(rev(seq_along(km_sum$time)), each = 2)

  out <- tibble::tibble(
    time = time,
    nrisk = nrisk,
    survival = survival,
    count = count[-length(count)]
  )

  out <- out[order(out$time, -out$survival, -out$nrisk), ]

  out
}


#' Summarize the data in a survfit object
#'
#' This function takes a survival object generated from a call to
#' \code{\link[survival]{survfit}} and produces a concise summary table.
#'
#' @param fit A survival object generated from a call to
#'   \code{\link[survival]{survfit}}.
#' @param strata_labels Optional parameter to rename the group column
#'   (if multiple strata)
#'
#' @returns A data frame summarizing the data in the survival object. The table
#' includes information about the strata, number of observations,
#' number of events, restricted mean survival times, median survival times,
#' and corresponding confidence intervals.
#'
#' @importFrom tibble rownames_to_column
#' @importFrom dplyr select
#'
#' @noRd
summarise_km <- function(fit, strata_labels = NULL) {
  if ("strata" %in% names(fit)) {
    out <- as.data.frame(summary(fit)$table) |>
      tibble::rownames_to_column(var = "group")

    if (!is.null(strata_labels)) {
      out$group <- strata_labels
    }
  } else {
    out <- as.data.frame(t(summary(fit)$table))
  }

  out <- out |> dplyr::select(-"n.max", -"n.start")

  out
}