R/assign.R

Defines functions assign_ct assign_no_ct sdtm_assign

Documented in assign_ct assign_no_ct sdtm_assign

#' Derive an SDTM variable
#'
#' @description
#' [sdtm_assign()] is an internal function packing the same functionality as
#' [assign_no_ct()] and [assign_ct()] together but aimed at developers only.
#' As a user please use either [assign_no_ct()] or [assign_ct()].
#'
#' @param raw_dat The raw dataset (dataframe); must include the
#'   variables passed in `id_vars` and `raw_var`.
#' @param raw_var The raw variable: a single string indicating the name of the
#'   raw variable in `raw_dat`.
#' @param tgt_var The target SDTM variable: a single string indicating the name
#'   of variable to be derived.
#' @param ct_spec Study controlled terminology specification: a dataframe with a
#'   minimal set of columns, see [ct_spec_vars()] for details. This parameter is
#'   optional, if left as `NULL` no controlled terminology recoding is applied.
#' @param ct_clst A codelist code indicating which subset of the controlled
#'   terminology to apply in the derivation. This parameter is optional, if left
#'   as `NULL`, all possible recodings in `ct_spec` are attempted.
#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by
#'   the variables indicated in `id_vars`. This parameter is optional, see
#'   section Value for how the output changes depending on this argument value.
#' @param id_vars Key variables to be used in the join between the raw dataset
#'   (`raw_dat`) and the target data set (`tgt_dat`).
#'
#' @returns The returned data set depends on the value of `tgt_dat`:
#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to
#' `NULL`, then the returned data set is `raw_dat`, selected for the variables
#' indicated in `id_vars`, and a new extra column: the derived variable, as
#' indicated in `tgt_var`.
#' - If the target dataset is provided, then it is merged with the raw data set
#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the
#' derived variable, as indicated in `tgt_var`.
#'
#'
#' @importFrom rlang :=
#' @keywords internal
sdtm_assign <- function(tgt_dat = NULL,
                        tgt_var,
                        raw_dat,
                        raw_var,
                        ct_spec = NULL,
                        ct_clst = NULL,
                        id_vars = oak_id_vars()) {
  admiraldev::assert_character_scalar(raw_var)
  admiraldev::assert_character_scalar(tgt_var)
  admiraldev::assert_character_vector(id_vars)
  assertthat::assert_that(contains_oak_id_vars(id_vars),
    msg = "`id_vars` must include the oak id vars."
  )
  admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var)))
  admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE)
  assert_ct_spec(ct_spec, optional = TRUE)
  assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = TRUE)

  join_dat <-
    raw_dat |>
    dplyr::select(dplyr::all_of(c(id_vars, raw_var))) |>
    sdtm_join(tgt_dat = tgt_dat, id_vars = id_vars)

  # Recode the raw variable following terminology.
  tgt_val <- ct_map(join_dat[[raw_var]], ct_spec = ct_spec, ct_clst = ct_clst)

  # Current target values
  cur_tgt_val <- join_dat[[tgt_var]] %||% as.vector(NA, mode = typeof(tgt_val))

  join_dat |>
    mutate("{tgt_var}" := dplyr::coalesce(cur_tgt_val, tgt_val)) |> # nolint object_name_linter()
    dplyr::select(-dplyr::any_of(setdiff(raw_var, tgt_var))) |>
    dplyr::relocate(dplyr::all_of(tgt_var), .after = dplyr::last_col())
}

#' Derive an SDTM variable
#'
#' @description
#' - [assign_no_ct()] maps a variable in a raw dataset to a target SDTM
#' variable that has no terminology restrictions.
#'
#' - [assign_ct()] maps a variable in a raw dataset to a target SDTM variable
#' following controlled terminology recoding.
#'
#' @param raw_dat The raw dataset (dataframe); must include the
#'   variables passed in `id_vars` and `raw_var`.
#' @param raw_var The raw variable: a single string indicating the name of the
#'   raw variable in `raw_dat`.
#' @param tgt_var The target SDTM variable: a single string indicating the name
#'   of variable to be derived.
#' @param ct_spec Study controlled terminology specification: a dataframe with a
#'   minimal set of columns, see [ct_spec_vars()] for details.
#' @param ct_clst A codelist code indicating which subset of the controlled
#'   terminology to apply in the derivation.
#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by
#'   the variables indicated in `id_vars`. This parameter is optional, see
#'   section Value for how the output changes depending on this argument value.
#' @param id_vars Key variables to be used in the join between the raw dataset
#'   (`raw_dat`) and the target data set (`raw_dat`).
#'
#' @returns The returned data set depends on the value of `tgt_dat`:
#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to
#' `NULL`, then the returned data set is `raw_dat`, selected for the variables
#' indicated in `id_vars`, and a new extra column: the derived variable, as
#' indicated in `tgt_var`.
#' - If the target dataset is provided, then it is merged with the raw data set
#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the
#' derived variable, as indicated in `tgt_var`.
#'
#' @examples
#'
#' md1 <-
#'   tibble::tibble(
#'     oak_id = 1:14,
#'     raw_source = "MD1",
#'     patient_number = 101:114,
#'     MDIND = c(
#'       "NAUSEA", "NAUSEA", "ANEMIA", "NAUSEA", "PYREXIA",
#'       "VOMITINGS", "DIARHHEA", "COLD",
#'       "FEVER", "LEG PAIN", "FEVER", "COLD", "COLD", "PAIN"
#'     )
#'   )
#'
#' assign_no_ct(
#'   tgt_var = "CMINDC",
#'   raw_dat = md1,
#'   raw_var = "MDIND"
#' )
#'
#' cm_inter <-
#'   tibble::tibble(
#'     oak_id = 1:14,
#'     raw_source = "MD1",
#'     patient_number = 101:114,
#'     CMTRT = c(
#'       "BABY ASPIRIN",
#'       "CORTISPORIN",
#'       "ASPIRIN",
#'       "DIPHENHYDRAMINE HCL",
#'       "PARCETEMOL",
#'       "VOMIKIND",
#'       "ZENFLOX OZ",
#'       "AMITRYPTYLINE",
#'       "BENADRYL",
#'       "DIPHENHYDRAMINE HYDROCHLORIDE",
#'       "TETRACYCLINE",
#'       "BENADRYL",
#'       "SOMINEX",
#'       "ZQUILL"
#'     ),
#'     CMROUTE = c(
#'       "ORAL",
#'       "ORAL",
#'       NA,
#'       "ORAL",
#'       "ORAL",
#'       "ORAL",
#'       "INTRAMUSCULAR",
#'       "INTRA-ARTERIAL",
#'       NA,
#'       "NON-STANDARD",
#'       "RANDOM_VALUE",
#'       "INTRA-ARTICULAR",
#'       "TRANSDERMAL",
#'       "OPHTHALMIC"
#'     )
#'   )
#'
#' # Controlled terminology specification
#' (ct_spec <- read_ct_spec_example("ct-01-cm"))
#'
#' assign_ct(
#'   tgt_dat = cm_inter,
#'   tgt_var = "CMINDC",
#'   raw_dat = md1,
#'   raw_var = "MDIND",
#'   ct_spec = ct_spec,
#'   ct_clst = "C66729"
#' )
#'
#' # Variables are derived in sequence from multiple input sources.
#' # For each target variable, only missing (`NA`) values are filled
#' # during each step—previously assigned (non-missing) values are retained.
#'
#' cm_raw <-
#'   tibble::tibble(
#'     oak_id = 1:4,
#'     raw_source = "cm_raw",
#'     patient_number = 370L + oak_id,
#'     PATNUM = patient_number,
#'     IT.CMTRT = c("BABY ASPIRIN", "CORTISPORIN", NA, NA),
#'     IT.CMTRTOTH = c("Other Treatment - ", NA, "Other Treatment - Baby Aspirin", NA)
#'   )
#'
#' cm_raw
#'
#' # Derivation of `CMTRT` first from `IT.CMTRT` and then from `IT.CMTRTOTH`.
#' assign_no_ct(
#'   raw_dat = cm_raw,
#'   raw_var = "IT.CMTRT",
#'   tgt_var = "CMTRT"
#' ) |>
#'   assign_no_ct(
#'     raw_dat = cm_raw,
#'     raw_var = "IT.CMTRTOTH",
#'     tgt_var = "CMTRT"
#'   )
#'
#' # Derivation of `CMTRT` first from `IT.CMTRTOTH` and then from `IT.CMTRT`.
#' assign_no_ct(
#'   raw_dat = cm_raw,
#'   raw_var = "IT.CMTRTOTH",
#'   tgt_var = "CMTRT"
#' ) |>
#'   assign_no_ct(
#'     raw_dat = cm_raw,
#'     raw_var = "IT.CMTRT",
#'     tgt_var = "CMTRT"
#'   )
#'
#' # Another example of variables derived in sequence from multiple input
#' # sources but now with controlled terminology remapping, in this case,
#' # CDISC Dose Unit (C71620) recoding.
#'
#' cm_raw2 <- tibble::tibble(
#'   oak_id = c(1:3, 6, 8:10, 12:14),
#'   raw_source = "cm_raw",
#'   patient_number = c(rep(375L, 2), 376:377, rep(378L, 3), rep(379L, 3)),
#'   PATNUM = patient_number,
#'   `IT.DOSUO` = c(NA, NA, NA, NA, NA, "Other Dose Unit", "cap", NA, NA, NA),
#'   `IT.CMDOSU` = c("mg", "Gram", NA, "Tablet", "g", "mg", NA, "IU", "mL", "%")
#' )
#'
#' assign_ct(
#'   raw_dat = cm_raw2,
#'   raw_var = "IT.DOSUO",
#'   tgt_var = "CMDOSU",
#'   ct_spec = ct_spec,
#'   ct_clst = "C71620",
#'   # Dose Unit
#'   id_vars = oak_id_vars()
#' ) |>
#'   assign_ct(
#'     raw_dat = cm_raw2,
#'     raw_var = "IT.CMDOSU",
#'     tgt_var = "CMDOSU",
#'     ct_spec = ct_spec,
#'     ct_clst = "C71620",
#'     id_vars = oak_id_vars()
#'   )
#'
#' @name assign
NULL

#' @order 1
#' @export
#' @rdname assign
assign_no_ct <- function(tgt_dat = NULL,
                         tgt_var,
                         raw_dat,
                         raw_var,
                         id_vars = oak_id_vars()) {
  admiraldev::assert_character_scalar(raw_var)
  admiraldev::assert_character_scalar(tgt_var)
  admiraldev::assert_character_vector(id_vars)
  assertthat::assert_that(contains_oak_id_vars(id_vars),
    msg = "`id_vars` must include the oak id vars."
  )
  admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var)))
  admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE)

  sdtm_assign(
    tgt_dat = tgt_dat,
    tgt_var = tgt_var,
    raw_dat = raw_dat,
    raw_var = raw_var,
    id_vars = id_vars
  )
}

#' @order 2
#' @export
#' @rdname assign
assign_ct <- function(tgt_dat = NULL,
                      tgt_var,
                      raw_dat,
                      raw_var,
                      ct_spec,
                      ct_clst,
                      id_vars = oak_id_vars()) {
  admiraldev::assert_character_scalar(raw_var)
  admiraldev::assert_character_scalar(tgt_var)
  admiraldev::assert_character_vector(id_vars)
  assertthat::assert_that(contains_oak_id_vars(id_vars),
    msg = "`id_vars` must include the oak id vars."
  )
  admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var)))
  admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE)

  sdtm_assign(
    tgt_dat = tgt_dat,
    tgt_var = tgt_var,
    raw_dat = raw_dat,
    raw_var = raw_var,
    id_vars = id_vars,
    ct_spec = ct_spec,
    ct_clst = ct_clst
  )
}

Try the sdtm.oak package in your browser

Any scripts or data that you put into this service are public.

sdtm.oak documentation built on June 9, 2025, 5:10 p.m.