Nothing
#' Dropout Curve and Observation Distribution for Irregular Longitudinal Data
#'
#' This function generates a combined plot of a dropout curve and a histogram of observation counts over time.
#' The dropout curve shows how many subjects remain in the study over time based on their last observation time.
#' The histogram shows how the observations are distributed across time.
#'
#' @param data A data frame containing the longitudinal data.
#' @param id_col A character string specifying the column name for subject identifiers.
#' @param time_col A character string specifying the column name for the time variable.
#' @param bins Number of bins for the histogram (default is 100).
#' @param percentile A numeric value between 0 and 100 specifying the cutoff for the red dropout line (default is 90).
#'
#' @return A list with two elements:
#' \itemize{
#' \item \code{plot}: A ggplot object showing the dropout curve and histogram.
#' \item \code{data}: A data frame with mid-points of the time bins (`mid_time`) and the number of observations (`count`) in descending order.
#' }
#'
#' @import dplyr
#' @import ggplot2
#' @import scales
#' @importFrom rlang sym
#'
#' @examples
#' \dontrun{
#' data(smocc) # assumes smocc is loaded with columns id and age
#' result <- dropplot(data = smocc, id_col = "id", time_col = "age", bins = 60, percentile = 90)
#' print(result$plot)
#' head(result$data)
#' }
#'
#' @export
dropplot <- function(data, id_col, time_col, bins = 100, percentile = 90) {
id_sym <- rlang::sym(id_col)
time_sym <- rlang::sym(time_col)
# Dropout curve
last_obs <- data %>%
dplyr::group_by(!!id_sym) %>%
dplyr::summarize(last_time = max(!!time_sym), .groups = "drop_last")
time_grid <- seq(0, max(data[[time_col]], na.rm = TRUE), length.out = 1000)
dropout_curve <- data.frame(
time = time_grid,
n_subjects = sapply(time_grid, function(t) sum(last_obs$last_time >= t))
)
# Custom cutoff
total_subjects <- nrow(last_obs)
remaining_subjects_cutoff <- ceiling((100 - percentile) / 100 * total_subjects)
cutoff_time <- min(dropout_curve$time[dropout_curve$n_subjects <= remaining_subjects_cutoff])
# Histogram
breaks <- seq(min(data[[time_col]], na.rm = TRUE), max(data[[time_col]], na.rm = TRUE), length.out = bins + 1)
hist_data <- suppressWarnings({
data %>%
dplyr::mutate(bin = cut(!!time_sym, breaks = breaks, include.lowest = TRUE)) %>%
dplyr::count(bin) %>%
dplyr::mutate(
lower = as.numeric(sub("\\((.+),.*", "\\1", bin)),
upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", bin)),
bin_mid = (lower + upper) / 2
) %>%
dplyr::filter(!is.na(bin_mid))
})
bin_width <- diff(breaks)[1] * 0.9
# Plot
p <- ggplot2::ggplot() +
ggplot2::geom_bar(data = hist_data, aes(x = bin_mid, y = n),
stat = "identity", fill = "blue", alpha = 0.4, width = bin_width) +
ggplot2::geom_line(data = dropout_curve, aes(x = time, y = n_subjects),
color = "darkblue", size = 0.8) +
ggplot2::geom_hline(yintercept = remaining_subjects_cutoff, color = "red", linetype = "dashed", size = 1) +
ggplot2::annotate("text", x = max(data[[time_col]], na.rm = TRUE), y = remaining_subjects_cutoff,
label = paste0(percentile, "% dropout (", remaining_subjects_cutoff, " ", id_col, " remaining)"),
hjust = 1.1, vjust = -0.5, color = "red", fontface = "bold", size = 4) +
ggplot2::scale_y_continuous(
name = "Number of subjects or observations",
breaks = scales::pretty_breaks(n = 5)
) +
ggplot2::scale_x_continuous(
name = time_col,
breaks = scales::pretty_breaks(n = 5)
) +
ggplot2::ggtitle("Dropout Curve and Observation Distribution for Irregular Longitudinal Data") +
ggplot2::geom_hline(yintercept = 0, color = "black", size = 1) +
ggplot2::geom_vline(xintercept = 0, color = "black", size = 1) +
ggplot2::theme_minimal() +
ggplot2::theme(
legend.position = "none",
panel.grid.major = ggplot2::element_line(color = "grey85", size = 0.3),
panel.grid.minor = ggplot2::element_line(color = "grey85", size = 0.1),
axis.line = ggplot2::element_blank(),
panel.border = ggplot2::element_blank(),
plot.title = ggplot2::element_text(face = "bold", size = 14),
axis.title = ggplot2::element_text(face = "bold", size = 12),
axis.text = ggplot2::element_text(face = "bold", size = 14)
)
result_df <- hist_data %>%
dplyr::select(mid_time = bin_mid, count = n) %>%
dplyr::arrange(desc(count))
return(list(plot = p, data = result_df))
}
utils::globalVariables(c('bin','lower','upper','bin_mid','n_subjects'))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.