R/hb_data.R
In historicalborrow: Non-Longitudinal Bayesian Historical Borrowing Models

Documented in hb_data

#' @title Standardize data
#' @export
#' @family data
#' @description Standardize a tidy input dataset.
#' @details Users do not normally need to call this function.
#'   It mainly serves exposes the indexing behavior of
#'   studies and group levels to aid in interpreting
#'   summary tables.
#' @return A standardized tidy data frame with one row per patient
#'   and the following columns:
#'   * `response`: continuous response/outcome variable. (Should be
#'     change from baseline of an outcome of interest.)
#'   * `study_label`: human-readable label of the study.
#'   * `study`: integer study index with the max index equal to the
#'     current study (at `study_reference`).
#'   * `group_label`: human-readable group label (e.g. treatment arm name).
#'   * `group`: integer group index with an index of 1 equal to the control
#'     group (at `group_reference`).
#'   * `patient_label`: original patient ID.
#'   * `patient`: integer patient index.
#'   * `covariate_*`: baseline covariate columns.
#' @param data A tidy data frame or `tibble` with the data.
#' @param response Character of length 1,
#'   name of the column in `data` with the response/outcome variable.
#'   `data[[response]]` must be a continuous variable,
#'   and it *should* be the change from baseline of a
#'   clinical endpoint of interest, as opposed to just
#'   the raw response. Treatment differences
#'   are computed directly from this scale, please supply
#'   change from baseline unless you are absolutely certain
#'   that treatment differences computed directly from
#'   this quantity are clinically meaningful.
#' @param study Character of length 1,
#'   name of the column in `data` with the study ID.
#' @param study_reference Atomic of length 1,
#'   element of the `study` column that indicates
#'   the current study.
#'   (The other studies are historical studies.)
#' @param group Character of length 1,
#'   name of the column in `data` with the group ID.
#' @param group_reference Atomic of length 1,
#'   element of the `group` column that indicates
#'   the control group.
#'   (The other groups may be treatment groups.)
#' @param patient Character of length 1,
#'   name of the column in `data` with the patient ID.
#' @param covariates Character vector of column names
#'   in `data` with the columns with baseline covariates.
#'   These can be continuous, categorical, or binary.
#'   Regardless, `historicalborrow` derives the appropriate
#'   model matrix.
#' @examples
#' data <- hb_sim_independent(n_continuous = 1, n_study = 2)$data
#' data <- dplyr::select(
#'   data,
#'   study,
#'   group,
#'   patient,
#'   response,
#'   tidyselect::everything()
#' )
#' colnames(data) <- c("trial", "arm", "subject", "change", "cov1", "cov2")
#' data$trial <- paste0("trial", data$trial)
#' data$arm <- paste0("arm", data$arm)
#' hb_data(
#'   data = data,
#'   response = "change",
#'   study = "trial",
#'   study_reference = "trial1",
#'   group = "arm",
#'   group_reference = "arm1",
#'   patient = "subject",
#'   covariates = c("cov1", "cov2")
#' )
hb_data <- function(
  data,
  response,
  study,
  study_reference,
  group,
  group_reference,
  patient,
  covariates
) {
  true(is.data.frame(data))
  true(response, !anyNA(.), is.character(.), length(.) == 1)
  true(study, !anyNA(.), is.character(.), length(.) == 1)
  true(group, !anyNA(.), is.character(.), length(.) == 1)
  true(patient, !anyNA(.), is.character(.), length(.) == 1)
  true(covariates, !anyNA(.), is.character(.))
  true(group_reference, !anyNA(.), length(.) == 1)
  true(study_reference, !anyNA(.), length(.) == 1)
  true(dim(data), length(.) == 2, . > 0)
  true(response %in% colnames(data))
  true(study %in% colnames(data))
  true(group %in% colnames(data))
  true(patient %in% colnames(data))
  true(all(covariates %in% colnames(data)))
  true(study_reference %in% data[[study]])
  true(group_reference %in% data[[group]])
  true(is.numeric(data[[response]]))
  true(data[[response]], is.atomic(.), is.vector(.))
  true(data[[study]], is.atomic(.), is.vector(.), !anyNA(.))
  true(data[[group]], is.atomic(.), is.vector(.), !anyNA(.))
  true(data[[patient]], is.atomic(.), is.vector(.), !anyNA(.))
  for (covariate in covariates) {
    true(data[[covariate]], is.atomic(.), is.vector(.), !anyNA(.))
  }
  out <- tibble::tibble(
    response = data[[response]],
    study_label = data[[study]],
    group_label = data[[group]],
    patient_label = data[[patient]],
    study = as_index_max(data[[study]], max = study_reference),
    group = as_index_min(data[[group]], min = group_reference),
    patient = as_index_min(data[[patient]])
  )
  for (x in covariates) {
    name <- if_any(grepl("^covariate", x), x, paste0("covariate_", x))
    out[[name]] <- data[[x]]
  }
  dplyr::arrange(out, study, group, patient)
}

as_index_min <- function(x, min = base::min(x)) {
  levels <- c(min, setdiff(sort(unique(x)), min))
  out <- as.integer(ordered(x, levels = levels))
  out - min(out) + 1L
}

as_index_max <- function(x, max = base::max(x)) {
  levels <- c(setdiff(sort(unique(x)), max), max)
  out <- as.integer(ordered(x, levels = levels))
  out - min(out) + 1L
}