R/dropplot.R

Defines functions dropplot

Documented in dropplot

#' Dropout Curve and Observation Distribution for Irregular Longitudinal Data
#'
#' This function generates a combined plot of a dropout curve and a histogram of observation counts over time.
#' The dropout curve shows how many subjects remain in the study over time based on their last observation time.
#' The histogram shows how the observations are distributed across time.
#'
#' @param data A data frame containing the longitudinal data.
#' @param id_col A character string specifying the column name for subject identifiers.
#' @param time_col A character string specifying the column name for the time variable.
#' @param bins Number of bins for the histogram (default is 100).
#' @param percentile A numeric value between 0 and 100 specifying the cutoff for the red dropout line (default is 90).
#'
#' @return A list with two elements:
#' \itemize{
#'   \item \code{plot}: A ggplot object showing the dropout curve and histogram.
#'   \item \code{data}: A data frame with mid-points of the time bins (`mid_time`) and the number of observations (`count`) in descending order.
#' }
#'
#' @import dplyr
#' @import ggplot2
#' @import scales
#' @importFrom rlang sym
#'
#' @examples
#' \dontrun{
#'   data(smocc)  # assumes smocc is loaded with columns id and age
#'   result <- dropplot(data = smocc, id_col = "id", time_col = "age", bins = 60, percentile = 90)
#'   print(result$plot)
#'   head(result$data)
#' }
#'
#' @export
dropplot <- function(data, id_col, time_col, bins = 100, percentile = 90) {
  id_sym <- rlang::sym(id_col)
  time_sym <- rlang::sym(time_col)

  # Dropout curve
  last_obs <- data %>%
    dplyr::group_by(!!id_sym) %>%
    dplyr::summarize(last_time = max(!!time_sym), .groups = "drop_last")

  time_grid <- seq(0, max(data[[time_col]], na.rm = TRUE), length.out = 1000)
  dropout_curve <- data.frame(
    time = time_grid,
    n_subjects = sapply(time_grid, function(t) sum(last_obs$last_time >= t))
  )

  # Custom cutoff
  total_subjects <- nrow(last_obs)
  remaining_subjects_cutoff <- ceiling((100 - percentile) / 100 * total_subjects)
  cutoff_time <- min(dropout_curve$time[dropout_curve$n_subjects <= remaining_subjects_cutoff])

  # Histogram
  breaks <- seq(min(data[[time_col]], na.rm = TRUE), max(data[[time_col]], na.rm = TRUE), length.out = bins + 1)
  hist_data <- suppressWarnings({
    data %>%
      dplyr::mutate(bin = cut(!!time_sym, breaks = breaks, include.lowest = TRUE)) %>%
      dplyr::count(bin) %>%
      dplyr::mutate(
        lower = as.numeric(sub("\\((.+),.*", "\\1", bin)),
        upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", bin)),
        bin_mid = (lower + upper) / 2
      ) %>%
      dplyr::filter(!is.na(bin_mid))
  })

  bin_width <- diff(breaks)[1] * 0.9

  # Plot
  p <- ggplot2::ggplot() +
    ggplot2::geom_bar(data = hist_data, aes(x = bin_mid, y = n),
                      stat = "identity", fill = "blue", alpha = 0.4, width = bin_width) +
    ggplot2::geom_line(data = dropout_curve, aes(x = time, y = n_subjects),
                       color = "darkblue", size = 0.8) +
    ggplot2::geom_hline(yintercept = remaining_subjects_cutoff, color = "red", linetype = "dashed", size = 1) +
    ggplot2::annotate("text", x = max(data[[time_col]], na.rm = TRUE), y = remaining_subjects_cutoff,
                      label = paste0(percentile, "% dropout (", remaining_subjects_cutoff, " ", id_col, " remaining)"),
                      hjust = 1.1, vjust = -0.5, color = "red", fontface = "bold", size = 4) +
    ggplot2::scale_y_continuous(
      name = "Number of subjects or observations",
      breaks = scales::pretty_breaks(n = 5)
    ) +
    ggplot2::scale_x_continuous(
      name = time_col,
      breaks = scales::pretty_breaks(n = 5)
    ) +
    ggplot2::ggtitle("Dropout Curve and Observation Distribution for Irregular Longitudinal Data") +
    ggplot2::geom_hline(yintercept = 0, color = "black", size = 1) +
    ggplot2::geom_vline(xintercept = 0, color = "black", size = 1) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      legend.position = "none",
      panel.grid.major = ggplot2::element_line(color = "grey85", size = 0.3),
      panel.grid.minor = ggplot2::element_line(color = "grey85", size = 0.1),
      axis.line = ggplot2::element_blank(),
      panel.border = ggplot2::element_blank(),
      plot.title = ggplot2::element_text(face = "bold", size = 14),
      axis.title = ggplot2::element_text(face = "bold", size = 12),
      axis.text = ggplot2::element_text(face = "bold", size = 14)
    )

  result_df <- hist_data %>%
    dplyr::select(mid_time = bin_mid, count = n) %>%
    dplyr::arrange(desc(count))

  return(list(plot = p, data = result_df))
}

utils::globalVariables(c('bin','lower','upper','bin_mid','n_subjects'))

Try the ILRCM package in your browser

Any scripts or data that you put into this service are public.

ILRCM documentation built on Aug. 12, 2025, 1:08 a.m.