admiral: ADaM in R Asset Library

Documented in create_period_dataset derive_vars_period

#' Create a Reference Dataset for Subperiods, Periods, or Phases
#'
#' The function creates a reference dataset for subperiods, periods, or phases
#' from the `ADSL` dataset. The reference dataset can be used to derive
#' subperiod, period, or phase variables like `ASPER`, `ASPRSDT`, `ASPREDT`,
#' `APERIOD`, `APERSDT`, `APEREDT`, `TRTA`, `APHASEN`, `PHSDTM`, `PHEDTM`, ...
#' in OCCDS and BDS datasets.
#'
#' @param dataset
#' `r roxygen_param_dataset(expected_vars = c("new_vars", "subject_keys"))`
#' For each element of `new_vars` at least one variable of the form of the right
#' hand side value must be available in the dataset.
#'
#' @param new_vars New variables
#'
#'   A named list of variables like `exprs(PHSDT = PHwSDT, PHEDT = PHwEDT, APHASE
#'   = APHASEw)` is expected. The left hand side of the elements defines a
#'   variable of the output dataset, the right hand side defines the source
#'   variables from the ADSL dataset in CDISC notation.
#'
#'   If the lower case letter "w"  is used it refers to a phase variable, if the
#'   lower case letters "xx" are used it refers to a period variable, and if
#'   both "xx" and "w" are used it refers to a subperiod variable.
#'
#'   Only one type must be used, e.g., all right hand side values must refer to
#'   period variables. It is not allowed to mix for example period and subperiod
#'   variables. If period *and* subperiod variables are required, separate
#'   reference datasets must be created.
#'
#' @param subject_keys Variables to uniquely identify a subject
#'
#'   A list of expressions where the expressions are symbols as returned by
#'   `exprs()` is expected.
#'
#'
#' @details For each subject and each subperiod/period/phase where at least one
#'   of the source variable is not `NA` an observation is added to the output
#'   dataset.
#'
#'   Depending on the type of the source variable (subperiod, period, or phase)
#'   the variable `ASPER`, `APERIOD`, or `APHASEN` is added and set to the
#'   number of the subperiod, period, or phase.
#'
#'   The variables specified for `new_vars` (left hand side) are added to the
#'   output dataset and set to the value of the source variable (right hand
#'   side).
#'
#' @return A period reference dataset (see "Details" section)
#'
#' @seealso [derive_vars_period()]
#'
#' @keywords create_aux
#' @family create_aux
#'
#' @export
#'
#' @examples
#' library(tibble)
#' library(dplyr, warn.conflicts = FALSE)
#' library(lubridate)
#'
#' # Create reference dataset for periods
#' adsl <- tribble(
#'   ~USUBJID, ~AP01SDT,     ~AP01EDT,     ~AP02SDT,     ~AP02EDT,     ~TRT01A, ~TRT02A,
#'   "1",      "2021-01-04", "2021-02-06", "2021-02-07", "2021-03-07", "A",     "B",
#'   "2",      "2021-02-02", "2021-03-02", "2021-03-03", "2021-04-01", "B",     "A",
#' ) %>%
#'   mutate(
#'     across(matches("AP\\d\\d[ES]DT"), ymd)
#'   ) %>%
#'   mutate(
#'     STUDYID = "xyz"
#'   )
#'
#' create_period_dataset(
#'   adsl,
#'   new_vars = exprs(APERSDT = APxxSDT, APEREDT = APxxEDT, TRTA = TRTxxA)
#' )
#'
#' # Create reference dataset for phases
#' adsl <- tribble(
#'   ~USUBJID, ~PH1SDT,      ~PH1EDT,      ~PH2SDT,      ~PH2EDT,      ~APHASE1,    ~APHASE2,
#'   "1",      "2021-01-04", "2021-02-06", "2021-02-07", "2021-03-07", "TREATMENT", "FUP",
#'   "2",      "2021-02-02", "2021-03-02", NA,           NA,           "TREATMENT", NA
#' ) %>%
#'   mutate(
#'     across(matches("PH\\d[ES]DT"), ymd)
#'   ) %>%
#'   mutate(
#'     STUDYID = "xyz"
#'   )
#'
#' create_period_dataset(
#'   adsl,
#'   new_vars = exprs(PHSDT = PHwSDT, PHEDT = PHwEDT, APHASE = APHASEw)
#' )
#'
#' # Create reference datasets for subperiods
#' adsl <- tribble(
#'   ~USUBJID, ~P01S1SDT,    ~P01S1EDT,    ~P01S2SDT,    ~P01S2EDT,    ~P02S1SDT,    ~P02S1EDT,
#'   "1",      "2021-01-04", "2021-01-19", "2021-01-20", "2021-02-06", "2021-02-07", "2021-03-07",
#'   "2",      "2021-02-02", "2021-03-02", NA,           NA,           "2021-03-03", "2021-04-01"
#' ) %>%
#'   mutate(
#'     across(matches("P\\d\\dS\\d[ES]DT"), ymd)
#'   ) %>%
#'   mutate(
#'     STUDYID = "xyz"
#'   )
#'
#' create_period_dataset(
#'   adsl,
#'   new_vars = exprs(ASPRSDT = PxxSwSDT, ASPREDT = PxxSwEDT)
#' )
create_period_dataset <- function(dataset,
                                  new_vars,
                                  subject_keys = get_admiral_option("subject_keys")) {
  assert_vars(new_vars, expect_names = TRUE)
  assert_vars(subject_keys)
  assert_data_frame(dataset, required_vars = subject_keys)

  new_vars_names <- names(new_vars)
  new_vars_chr <- vars2chr(new_vars)
  cols <- str_replace(new_vars_chr, "xx", "\\\\d\\\\d") %>%
    str_replace("w", "\\\\d")
  names_pattern <- str_replace(new_vars_chr, "xx", "\\\\d\\\\d") %>%
    str_replace("w", "\\\\d") %>%
    str_replace("(\\w+)\\\\d", "(\\1)\\\\d") %>%
    str_replace_all("((\\\\d)+)", "(\\1)")
  mode <- case_when(
    str_detect(new_vars_chr, "\\w+xx\\w+w\\w*") ~ "subperiod",
    str_detect(new_vars_chr, "\\w+xx\\w*") ~ "period",
    str_detect(new_vars_chr, "\\w+w\\w*") ~ "phase",
    TRUE ~ "none"
  ) %>% unique()
  if (any(mode == "none")) {
    cli_abort(
      c(
        paste0(
          "The right hand side values of `new_vars` have to be CDISC style ",
          "subperiod, period, or phase variables."
        ),
        "I.e., they must contain the xx or w fragment, e.g., APxxSDT, PxxSwSDT, or PHwSDT."
      )
    )
  }
  if (length(mode) > 1) {
    cli_abort(
      discard(
        c(
          "More than one type of subperiod, period, or phase variables ",
          "is specified for `new_vars`:",
          if_else(
            "subperiod" %in% mode,
            "subperiod: {.var {new_vars_chr[mode == \"subperiod\"]}}",
            NA_character_
          ),
          if_else(
            "period" %in% mode,
            "period: {.var {new_vars_chr[mode == \"period\"]}}",
            NA_character_
          ),
          if_else(
            "phase" %in% mode,
            "phase:{.var {new_vars_chr[mode == \"phase\"]}}",
            NA_character_
          )
        ),
        is.na
      )
    )
  }
  prefix <- syms(str_match(cols, "(\\w+)\\\\")[, 2])
  num_var_chr <- c(
    subperiod = "ASPER",
    period = "APERIOD",
    phase = "APHASEN"
  )
  num_var <- syms(num_var_chr)
  period_ref <- vector("list", length(new_vars))
  for (i in seq_along(new_vars)) {
    if (!any(str_detect(colnames(dataset), names_pattern[[i]]))) {
      cli_abort(
        "No variables of the form {.var {new_vars_chr[[i]]}} were found in the input dataset."
      )
    }
    if (mode == "subperiod") {
      period_ref[[i]] <- pivot_longer(
        select(dataset, !!!subject_keys, matches(paste0(cols[[i]], "\\b"))),
        matches(paste0(cols[[i]], "\\b")),
        names_to = c(".value", "APERIOD", num_var_chr[[mode]]),
        names_pattern = names_pattern[[i]]
      ) %>%
        rename(!!sym(new_vars_names[[i]]) := !!prefix[[i]]) %>%
        mutate(
          APERIOD = as.integer(APERIOD),
          !!num_var[[mode]] := as.integer(!!num_var[[mode]])
        ) %>%
        filter(!is.na(!!sym(new_vars_names[[i]])))
      by_vars <- exprs(APERIOD, !!sym(num_var[[mode]]))
    } else {
      period_ref[[i]] <- pivot_longer(
        select(dataset, !!!subject_keys, matches(paste0(cols[[i]], "\\b"))),
        matches(paste0(cols[[i]], "\\b")),
        names_to = c(".value", num_var_chr[[mode]]),
        names_pattern = names_pattern[[i]]
      ) %>%
        rename(!!sym(new_vars_names[[i]]) := !!prefix[[i]]) %>%
        mutate(!!num_var[[mode]] := as.integer(!!num_var[[mode]])) %>%
        filter(!is.na(!!sym(new_vars_names[[i]])))
      by_vars <- exprs(!!sym(num_var[[mode]]))
    }
    if (i == 1) {
      period_ref_final <- period_ref[[1]]
    } else {
      period_ref_final <- derive_vars_merged(
        period_ref_final,
        dataset_add = period_ref[[i]],
        by_vars = expr_c(subject_keys, by_vars)
      )
    }
  }
  period_ref_final
}

#' Add Subperiod, Period, or Phase Variables to ADSL
#'
#' The function adds subperiod, period, or phase variables like `P01S1SDT`,
#' `P01S2SDT`, `AP01SDTM`, `AP02SDTM`, `TRT01A`, `TRT02A`, `PH1SDT`, `PH2SDT`,
#' ... to the input dataset. The values of the variables are defined by a period
#' reference dataset which has one observations per patient and subperiod,
#' period, or phase.
#'
#' @param dataset `r roxygen_param_dataset(expected_vars = c("subject_keys"))`
#'
#' @param dataset_ref Period reference dataset
#'
#'   The variables specified by `new_vars` and `subject_keys` are expected.
#'
#'   If subperiod variables are requested, `APERIOD` and `ASPER` are expected.
#'   If period variables are requested. `APERIOD` is expected. If phase
#'   variables are requested, `APHASEN` is expected.
#'
#' @param new_vars New variables
#'
#'   A named list of variables like `exprs(PHwSDT = PHSDT, PHwEDT = PHEDT,
#'   APHASEw = APHASE)` is expected. The left hand side of the elements defines
#'   a set of variables (in CDISC notation) to be added to the output dataset.
#'   The right hand side defines the source variable from the period reference
#'   dataset.
#'
#'   If the lower case letter "w"  is used it refers to a phase variable, if the
#'   lower case letters "xx" are used it refers to a period variable, and if
#'   both "xx" and "w" are used it refers to a subperiod variable.
#'
#'   Only one type must be used, e.g., all left hand side values must refer to
#'   period variables. It is not allowed to mix for example period and subperiod
#'   variables. If period *and* subperiod variables are required, separate calls
#'   must be used.
#'
#' @param subject_keys Variables to uniquely identify a subject
#'
#'   A list of expressions where the expressions are symbols as returned by
#'   `exprs()` is expected.
#'
#'
#' @details For each subperiod/period/phase in the period reference dataset and
#'   each element in `new_vars` a variable (LHS value of `new_vars`) is added to
#'   the output dataset and set to the value of the source variable (RHS value
#'   of `new_vars`.
#'
#' @return The input dataset with subperiod/period/phase variables added (see
#'   "Details" section)
#'
#' @seealso [create_period_dataset()]
#'
#' @keywords der_adsl
#' @family der_adsl
#'
#' @export
#'
#' @examples
#' library(tibble)
#' library(dplyr, warn.conflicts = FALSE)
#' library(lubridate)
#'
#' adsl <- tibble(STUDYID = "xyz", USUBJID = c("1", "2"))
#'
#' # Add period variables to ADSL
#' period_ref <- tribble(
#'   ~USUBJID, ~APERIOD, ~APERSDT,     ~APEREDT,
#'   "1",             1, "2021-01-04", "2021-02-06",
#'   "1",             2, "2021-02-07", "2021-03-07",
#'   "2",             1, "2021-02-02", "2021-03-02",
#'   "2",             2, "2021-03-03", "2021-04-01"
#' ) %>%
#'   mutate(
#'     STUDYID = "xyz",
#'     APERIOD = as.integer(APERIOD),
#'     across(matches("APER[ES]DT"), ymd)
#'   )
#'
#' derive_vars_period(
#'   adsl,
#'   dataset_ref = period_ref,
#'   new_vars = exprs(APxxSDT = APERSDT, APxxEDT = APEREDT)
#' ) %>%
#'   select(STUDYID, USUBJID, AP01SDT, AP01EDT, AP02SDT, AP02EDT)
#'
#' # Add phase variables to ADSL
#' phase_ref <- tribble(
#'   ~USUBJID, ~APHASEN, ~PHSDT,       ~PHEDT,       ~APHASE,
#'   "1",             1, "2021-01-04", "2021-02-06", "TREATMENT",
#'   "1",             2, "2021-02-07", "2021-03-07", "FUP",
#'   "2",             1, "2021-02-02", "2021-03-02", "TREATMENT"
#' ) %>%
#'   mutate(
#'     STUDYID = "xyz",
#'     APHASEN = as.integer(APHASEN),
#'     across(matches("PH[ES]DT"), ymd)
#'   )
#'
#' derive_vars_period(
#'   adsl,
#'   dataset_ref = phase_ref,
#'   new_vars = exprs(PHwSDT = PHSDT, PHwEDT = PHEDT, APHASEw = APHASE)
#' ) %>%
#'   select(STUDYID, USUBJID, PH1SDT, PH1EDT, PH2SDT, PH2EDT, APHASE1, APHASE2)
#'
#' # Add subperiod variables to ADSL
#' subperiod_ref <- tribble(
#'   ~USUBJID, ~APERIOD, ~ASPER, ~ASPRSDT,     ~ASPREDT,
#'   "1",             1,      1, "2021-01-04", "2021-01-19",
#'   "1",             1,      2, "2021-01-20", "2021-02-06",
#'   "1",             2,      1, "2021-02-07", "2021-03-07",
#'   "2",             1,      1, "2021-02-02", "2021-03-02",
#'   "2",             2,      1, "2021-03-03", "2021-04-01"
#' ) %>%
#'   mutate(
#'     STUDYID = "xyz",
#'     APERIOD = as.integer(APERIOD),
#'     ASPER = as.integer(ASPER),
#'     across(matches("ASPR[ES]DT"), ymd)
#'   )
#'
#' derive_vars_period(
#'   adsl,
#'   dataset_ref = subperiod_ref,
#'   new_vars = exprs(PxxSwSDT = ASPRSDT, PxxSwEDT = ASPREDT)
#' ) %>%
#'   select(STUDYID, USUBJID, P01S1SDT, P01S1EDT, P01S2SDT, P01S2EDT, P02S1SDT, P02S1EDT)
derive_vars_period <- function(dataset,
                               dataset_ref,
                               new_vars,
                               subject_keys = get_admiral_option("subject_keys")) {
  assert_vars(new_vars, expect_names = TRUE)
  assert_vars(subject_keys)
  assert_data_frame(dataset, required_vars = subject_keys)

  new_vars_names <- names(new_vars)
  new_vars_chr <- vars2chr(new_vars)
  mode <- case_when(
    str_detect(new_vars_names, "\\w+xx\\w+w\\w*") ~ "subperiod",
    str_detect(new_vars_names, "\\w+xx\\w*") ~ "period",
    str_detect(new_vars_names, "\\w+w\\w*") ~ "phase",
    TRUE ~ "none"
  ) %>% unique()
  if (any(mode == "none")) {
    cli_abort(
      c(
        paste0(
          "The left hand side values of `new_vars` have to be CDISC style ",
          "subperiod, period, or phase variables."
        ),
        "I.e., they must contain the xx or w fragment, e.g., APxxSDT, PxxSwSDT, or PHwSDT."
      )
    )
  }
  if (length(mode) > 1) {
    cli_abort(
      discard(
        c(
          "More than one type of subperiod, period, or phase variables ",
          "is specified for `new_vars`:",
          if_else(
            "subperiod" %in% mode,
            "subperiod: {.var {new_vars_names[mode == \"subperiod\"]}}",
            NA_character_
          ),
          if_else(
            "period" %in% mode,
            "period: {.var {new_vars_names[mode == \"period\"]}}",
            NA_character_
          ),
          if_else(
            "phase" %in% mode,
            "phase: {.var {new_vars_names[mode == \"phase\"]}}",
            NA_character_
          )
        ),
        is.na
      )
    )
  }
  if (mode == "subperiod") {
    id_vars <- exprs(APERIOD, ASPER)
  } else if (mode == "period") {
    id_vars <- exprs(APERIOD)
  } else {
    id_vars <- exprs(APHASEN)
  }

  assert_data_frame(dataset_ref, required_vars = expr_c(subject_keys, new_vars, id_vars))

  # Strip period reference dataset of any variables not required for later derivations
  ref_cols <- names(dataset_ref) %in% c(subject_keys, new_vars_chr, id_vars)
  ref_stripped <- dataset_ref[, ref_cols]

  ref_wide <- pivot_wider(
    ref_stripped,
    names_from = vars2chr(id_vars),
    names_glue = sprintf("{.value}_%s", paste0(sprintf("{%s}", vars2chr(id_vars)), collapse = "_")),
    values_from = unname(new_vars_chr)
  )

  # pivot_wider creates columns like APERSDT_1, APERSDT_2, ...
  # these need to be renamed to variables like AP01SDT, AP02SDT, ...
  rename_arg <- colnames(select(ref_wide, !!!negate_vars(subject_keys)))
  split_names <- str_match(rename_arg, "(\\w+?)_(\\d{1,2})_?(\\d)?")
  source_vars <- names(new_vars_chr)
  names(source_vars) <- new_vars_chr
  index <- split_names[, 3]
  if (mode == "phase") {
    names_rename_arg <- str_replace(source_vars[split_names[, 2]], "w", index)
  } else {
    # add leading zero for "xx" fragment
    index <- if_else(str_length(index) == 1, paste0("0", index), index)
    names_rename_arg <- str_replace(source_vars[split_names[, 2]], "xx", index)
    if (mode == "subperiod") {
      index2 <- split_names[, 4]
      names_rename_arg <- str_replace(names_rename_arg, "w", index2)
    }
  }
  names(rename_arg) <- names_rename_arg

  derive_vars_merged(
    dataset,
    dataset_add = ref_wide,
    by_vars = subject_keys
  ) %>% rename(all_of(rename_arg))
}