R/pivot.R
In dunlin: Preprocessing Tools for Clinical Trial Data

Documented in multi_id_pivot_wider poly_pivot_wider

#' Transforming data.frame with Multiple Identifying columns into Wide Format
#'
#' @details This function allows to identify observations on the basis of several columns. Warning: Instead of nesting
#'   duplicated values, the function will throw an error if the same parameter is provided twice for the same
#'   observation.
#'
#' @param data (`data.frame`) to be pivoted.
#' @param id (`character`) the name of the columns whose combination uniquely identify the observations.
#' @param param_from (`character`) the name of the column containing the names of the parameters to be pivoted. The
#'   unique values in this column will become column names in the output.
#' @param value_from (`character`) the name of the column containing the values that will populate the output.
#' @param drop_na (`logical`) should column containing only `NAs` be dropped.
#' @param drop_lvl (`logical`) should missing levels be dropped in the columns coming from (`value_from`).
#'
#' @returns `data.frame` in a wide format.
#'
#' @export
#' @examples
#' test_data <- data.frame(
#'   the_obs = c("A", "A", "A", "B", "B", "B", "C", "D"),
#'   the_obs2 = c("Ax", "Ax", "Ax", "Bx", "Bx", "Bx", "Cx", "Dx"),
#'   the_param = c("weight", "height", "gender", "weight", "gender", "height", "height", "other"),
#'   the_val = c(65, 165, "M", 66, "F", 166, 155, TRUE)
#' )
#'
#' multi_id_pivot_wider(test_data, c("the_obs", "the_obs2"), "the_param", "the_val")
#' multi_id_pivot_wider(test_data, "the_obs2", "the_param", "the_val")
multi_id_pivot_wider <- function(data,
                                 id,
                                 param_from,
                                 value_from,
                                 drop_na = FALSE,
                                 drop_lvl = FALSE) {
  # check for duplication of observation-parameter
  checkmate::assert_data_frame(data, min.rows = 1, min.cols = 3)
  checkmate::assert_character(id)
  checkmate::assert_character(param_from, len = 1)
  checkmate::assert_character(value_from, len = 1)
  checkmate::assert_false(any(duplicated(data[, c(id, param_from)])))
  checkmate::assert_subset(c(id, param_from, value_from), colnames(data))
  checkmate::assert_flag(drop_na)
  checkmate::assert_flag(drop_lvl)

  # find a way to sort
  unique_id <- unique(data[id])
  key <- apply(unique_id[id], 1, paste, collapse = "-")
  unique_id <- cbind(key, unique_id)

  param <- data[[param_from]]

  mini_data <- data[, c(param_from, value_from)]
  f_key <- apply(data[id], 1, paste, collapse = "-")
  mini_data <- cbind(f_key, mini_data)

  data_ls <- split(mini_data, param)

  # Transform to named vector, the first column is the key.
  data_vec <-
    lapply(
      data_ls,
      function(x) setNames(x[[value_from]], x[, 1])
    )

  if (drop_lvl) {
    data_vec <- rapply(data_vec, droplevels, classes = "factor", how = "replace")
  }

  # query each id in each param
  all_vec <- lapply(data_vec, function(x) x[unique_id[, 1]])

  if (drop_na) all_vec <- Filter(function(x) !all(is.na(x)), all_vec)

  all_vec <- lapply(all_vec, unname)
  bind_data <- do.call(dplyr::bind_cols, all_vec)

  res <- dplyr::bind_cols(unique_id[, -1, drop = FALSE], bind_data)

  rownames(res) <- NULL
  res
}

#' Transforming data.frame with multiple Data Column into Wide Format
#'
#' @details This function is adapted to cases where the data are distributed in several columns while the name of the
#'   parameter is in one. Typical example is `adsub` where numeric data are stored in `AVAL` while categorical data are
#'   in `AVALC`.
#'
#' @param data (`data.frame`) to be pivoted.
#' @param id (`character`) the name of the columns whose combination uniquely identify the observations.
#' @param param_from (`character`) the name of the columns containing the names of the parameters to be pivoted. The
#'   unique values in this column will become column names in the output.
#' @param value_from (`character`) the name of the column containing the values that will populate the output.
#' @param labels_from (`character`) the name of the column congaing the labels of the new columns. from. If not
#'   provided, the labels will be equal to the column names. When several labels are available for the same column, the
#'   first one will be selected.
#' @param drop_na (`logical`) should column containing only `NAs` be dropped.
#' @param drop_lvl (`logical`) should missing levels be dropped in the columns coming from `value_from`.
#'
#' @returns `list` of `data.frame` in a wide format with label attribute attached to each columns.
#'
#' @export
#' @examples
#' test_data <- data.frame(
#'   the_obs = c("A", "A", "A", "B", "B", "B", "C", "D"),
#'   the_obs2 = c("Ax", "Ax", "Ax", "Bx", "Bx", "Bx", "Cx", "Dx"),
#'   the_param = c("weight", "height", "gender", "weight", "gender", "height", "height", "other"),
#'   the_label = c(
#'     "Weight (Kg)", "Height (cm)", "Gender", "Weight (Kg)",
#'     "Gender", "Height (cm)", "Height (cm)", "Pre-condition"
#'   ),
#'   the_val = c(65, 165, NA, 66, NA, 166, 155, NA),
#'   the_val2 = c(65, 165, "M", 66, "F", 166, 155, TRUE)
#' )
#'
#' x <- poly_pivot_wider(
#'   test_data,
#'   c("the_obs", "the_obs2"),
#'   "the_param",
#'   c("the_val", "the_val2"),
#'   "the_label"
#' )
#' x
#' Reduce(function(u, v) merge(u, v, all = TRUE), x)
poly_pivot_wider <- function(data,
                             id,
                             param_from,
                             value_from,
                             labels_from = NULL,
                             drop_na = TRUE,
                             drop_lvl = FALSE) {
  # other tests are performed at lower levels.
  checkmate::assert_character(value_from, unique = TRUE)

  # Create new labels for new columns.
  if (is.null(labels_from) || labels_from == param_from) {
    new_labels <- unique(data[[param_from]])
    names(new_labels) <- new_labels
  } else {
    checkmate::assert_character(labels_from, len = 1)
    checkmate::assert_subset(labels_from, colnames(data))

    new_labels_df <- data[, c(labels_from, param_from)]
    new_labels_df <- unique(new_labels_df)

    new_labels <- as.character(new_labels_df[[labels_from]])
    names(new_labels) <- as.character(new_labels_df[[param_from]])
  }

  # Retrieve old labels.
  old_labels <- lapply(data, attr, "label")
  n_old_label <- names(old_labels)
  null_label <- unlist(lapply(old_labels, is.null))
  old_labels[null_label] <- n_old_label[null_label]
  old_labels <- unlist(old_labels)

  all_labels <- c(new_labels, old_labels)

  res_ls <- list()
  for (n_value_from in value_from) {
    res <- multi_id_pivot_wider(
      data = data,
      id = id,
      param_from = param_from,
      value_from = n_value_from,
      drop_na = drop_na,
      drop_lvl = drop_lvl
    )

    res <- attr_label_df(res, all_labels[colnames(res)])
    res_ls[[n_value_from]] <- res
  }
  res_ls
}