R/preptdc.R
In trtswitch: Treatment Switching

Documented in preptdc

#' @title Prepare Survival Data With Time-Dependent Covariates
#'
#' @description
#' This function prepares a counting-process style survival dataset
#' for analyses with time-dependent covariates. It merges baseline
#' and longitudinal data, fills in missing covariate values using
#' last-observation-carried-forward (LOCF), restricts to time points
#' where covariates change (optional), and constructs `tstart`, `tstop`,
#' and `event` variables suitable for use in survival models.
#'
#' @param adsl A data set containing baseline subject-level information. 
#'   It should include, at a minimum, subject ID (`id`), 
#'   randomization date (`randdt`), treatment start date (`trtsdt`),
#'   survival outcome (`osdt`, `died`), progression date (`pddt`), 
#'   treatment switch date (`xodt`), and data cut-off date (`dcutdt`).
#' @param adtdc A data set containing longitudinal
#'   time-dependent covariate data, with subject ID (`id`),
#'   parameter code (`paramcd`), analysis date (`adt`), and covariate
#'   value (`aval`).
#' @param id Character string specifying the column name for subject ID.
#' @param randdt Character string specifying the column name for 
#'   randomization date.
#' @param trtsdt Character string specifying the column name for 
#'   treatment start date.
#' @param pddt Character string specifying the column name for progression
#'   date.
#' @param xodt Character string specifying the column name for treatment
#'   crossover/switch date.
#' @param osdt Character string specifying the column name for overall
#'   survival date (death date or last known alive date).
#' @param died Character string specifying the column name for death
#'   indicator (0 = alive/censored, 1 = died).
#' @param dcutdt Character string specifying the column name for data
#'   cut-off date.
#' @param adt Character string specifying the column name for analysis
#'   date in the time-dependent covariate dataset.
#' @param paramcd Character string specifying the column name for parameter
#'   code (identifying different covariates).
#' @param aval Character string specifying the column name for analysis
#'   value (covariate values).
#' @param nodup Logical; if `TRUE` (default), only rows where at least
#'   one covariate changes compared to the previous row (within each subject)
#'   are retained, along with the first row per subject (baseline).
#' @param offset Logical; if `TRUE` (default), add 1-day offset when 
#'   computing analysis day variables (`ady`, `osdy`, etc.).
#'
#' @details
#' The function performs the following steps:
#' \enumerate{
#'   \item Merge `adsl` and `adtdc` to obtain randomization date and 
#'         treatment start date.
#'   \item Define `adt2` as `adt` if `adt > trtsdt`, 
#'         and `randdt` if `adt <= trtsdt` (i.e., baseline time point).
#'         This ensures that the baseline covariate value is the last 
#'         non-missing value at or before the treatment start date.
#'         Post-baseline covariate values are anchored at their actual
#'         analysis dates.
#'   \item Keep the last record per subject, `adt2`, and `paramcd`.
#'   \item Construct a complete skeleton so all covariates are present
#'         for each subject and time point.
#'   \item Fill missing covariate values using LOCF.
#'   \item Pivot to wide format with one row per subject and time point.
#'   \item Optionally drop rows without covariate changes (`nodup = TRUE`).
#'   \item Merge survival outcomes from `adsl`.
#'   \item Compute time-to-event variables (`ady`, `osdy`, etc.), as well as
#'         counting-process style variables `tstart`, `tstop`, and `event`.
#' }
#'
#' @return A data set with one row per subject and time interval, including:
#' \itemize{
#'   \item `tstart`, `tstop` — interval start and stop times 
#'         (days from randomization).
#'   \item `event` — event indicator (0/1).
#'   \item Covariates expanded to wide format.
#'   \item Auxiliary variables such as progression indicator (`pd`),
#'         treatment switch indicator (`swtrt`), and administrative 
#'         censoring time.
#' }
#'
#' @examples
#' 
#' surv_data <- preptdc(adsl, adtdc, nodup = TRUE)
#' head(surv_data)
#' 
#' @export
preptdc <- function(adsl, adtdc, id = "SUBJID", randdt = "RANDDT", 
                    trtsdt = "TRTSDT", pddt = "PDDT", xodt = "XODT", 
                    osdt = "OSDT", died = "DIED", 
                    dcutdt = "DCUTDT", adt = "ADT", 
                    paramcd = "PARAMCD", aval = "AVAL",
                    nodup = TRUE, offset = TRUE) {
  
  # convert input data into data.table
  data.table::setDT(adsl)
  data.table::setDT(adtdc)
  
  # verify whether the input data have required columns
  cols <- colnames(adsl)
  req_cols <- c(id, randdt, osdt, died, pddt, xodt, dcutdt)
  if (!all(req_cols %in% cols)) {
    stop(paste("The following columns are missing from adsl:", 
               paste(req_cols[!(req_cols %in% cols)], collapse = ", ")))
  }
  
  cols <- colnames(adtdc)
  req_cols <- c(id, adt, paramcd, aval)
  if (!all(req_cols %in% cols)) {
    stop(paste("The following columns are missing from adtdc:", 
               paste(req_cols[!(req_cols %in% cols)], collapse = ", ")))
  }
  
  # obtain randdt and trtsdt
  cols <- c(id, randdt, trtsdt)
  data1 <- merge(adtdc, adsl[, mget(cols)], by = id)
  
  # define adt2 to differentiate baseline and post-baseline time points
  data1[, `:=`(adt2 = data.table::fifelse(get(adt) <= get(trtsdt), 
                                          get(randdt), get(adt)))]
  
  # keep last record with each subject, adt2, and paramcd
  data.table::setorderv(data1, c(id, "adt2", paramcd, adt))
  data1 <- data1[, .SD[.N], by = c(id, "adt2", paramcd)]
  
  # build skeleton
  cols <- c(id, randdt, "adt2")
  pars <- unique(data1[[paramcd]])
  data2 <- merge(
    unique(data1[, mget(cols)])[, `:=`(dummy = 1)],
    data.table::data.table(temp_col = pars, dummy = 1),
    by = "dummy", allow.cartesian = TRUE)
  data.table::setnames(data2, "temp_col", paramcd)
  data2[, `:=`(dummy = NULL)]
  
  # right join
  data3 <- merge(data1, data2, 
                 by = c(id, randdt, "adt2", paramcd), 
                 all.y = TRUE)
  
  # LOCF
  data.table::setorderv(data3, c(id, paramcd, "adt2"))
  data3[, `:=`(temp_col = data.table::nafill(get(aval), type = "locf")), 
        by = c(id, paramcd)]
  data3[[aval]] <- NULL
  data.table::setnames(data3, "temp_col", aval)
  
  # wide format
  fml <- paste(paste(c(id, randdt, "adt2"), collapse = " + "), "~", paramcd)
  data4 <- data.table::dcast(data3, fml, value.var = aval)
  
  # de-dup
  if (nodup) {
    # only keep rows where at least one paramcd changes value
    data4[, `:=`(change = rowSums(
      .SD != data.table::shift(.SD, type = "lag"), na.rm = TRUE) > 0),
      by = id, .SDcols = pars]
    
    # keep only change rows (plus the very first row per subject 
    # to establish baseline)
    data4 <- data4[get("change") | !duplicated(get(id))]
    data4[, `:=`(change = NULL)]   # drop helper column if not needed
  }
  
  # merge survival info
  data5 <- merge(data4, adsl, by = c(id, randdt))
  data5[, `:=`(ady = as.integer(difftime(get("adt2"), get(randdt), 
                                         units = "days") + offset), 
               osdy = as.integer(difftime(get(osdt), get(randdt), 
                                          units = "days")) + offset)]
  
  # create tstart and tstop relative to randdt
  # time-dependent covariate values at tstart
  # last interval ends with osdt     
  data5[, `:=`(tstart = get("ady"))]
  data5[, `:=`(tstop = data.table::shift(get("ady"), type = "lead")), 
        by = id]
  data5[, `:=`(tstop = data.table::fifelse(is.na(get("tstop")), 
                                           get("osdy"), get("tstop")))]
  
  # create event
  data5[, `:=`(event = data.table::fifelse(
    seq_len(.N) == .N, get(died), 0L)), by = id]
  
  # set up pd, swtrt, and censoring time
  data5[, `:=`(pd = !is.na(get(pddt)), 
               pd_time = as.integer(difftime(
                 get(pddt), get(randdt), units = "days")) + offset,
               swtrt = !is.na(get(xodt)),
               swtrt_time = as.integer(difftime(
                 get(xodt), get(randdt), units = "days")) + offset, 
               censor_time = as.integer(difftime(
                 get(dcutdt), get(randdt), units = "days")) + offset)]
  
  as.data.frame(data5)
}