R/create-table-one.R

Defines functions .cleanLabs .checkLabsMatchCats .formatContStats .formatCatStats .subsetCoh .subsetVars .checkVarsInStats .checkArgs dh.createTableOne

Documented in dh.createTableOne

#' Creates tables in useful formats for including in manuscripts
#'
#' dh.getStats extracts key statistics and stores them in a clientside list.
#' dh.createTableOne builds on this by formatting the extract stats into a
#' table which can then be included in a manuscript. Flexible formatting options
#' are included.
#'
#' @param stats Exported object from dh.getStats.
#' @param vars Variable to be included in table.
#' @param var_labs Tibble with two columns: 'variable' containing the
#' names of the variables specified in `vars`, and 'var_label' containing the
#' replacement labels for these variables.
#' @param cat_labs Tibble with three columns: 'variable' containing the
#' names of the categorical variables specified in `vars`, 'category'
#' containing the categories of these variabels, and "cat_label" containing
#' the replacement category labels for these variables.
#' @param coh_labs Tibble with two columns: 'cohort' containing the names
#' of all cohorts included in `stats`, and 'cohort_labs' containing the
#' replacement labels for these cohorts.
#' @param type Character specifying which cohorts to include in the table. If
#' "combined" then only combined stats will be returned, if "cohort" then only
#' cohort-specific stats will be returned, if "both" then everything will be
#' returned.
#' @param coh_direction Character specifying direction of data if `type` is
#' 'cohort' or 'both'. Use 'rows' to return cohorts as rows and variable as
#' columns, or use 'cols' to return cohorts as columns and variables as rows.
#' Defauls is "col".
#' @param cont_format Character specifying which summary statistic to return
#' for continuous stats. Use 'med_iqr' to return the median and interquartile
#' range, use 'mean_sd' to return the mean and standard deviation. Default is
#' "med_iqr".
#' @param inc_missing Boolean specifying whether to return missing values in
#' the output. Use TRUE for yes and FALSE for no.
#' @param sig_digits Optionally, the number of decimal places to round output
#' to. Default is 2.
#' @param perc_denom The denominator for percentages. Either 'valid' for valid
#' cases or 'total' for total cases.
#'
#' @return Tibble containing formatted summary statistics. If `coh_direction` is
#' 'cols', the tibble will contain four columns: 'cohort', 'variable',
#' 'category' & value. If `coh_direction` is rows, the tibble will contain the
#' column 'cohort' as well as as columns for all continuous variables and all
#' categories of categorical variables.
#'
#' @family descriptive functions
#'
#' @importFrom dplyr %>% left_join mutate select filter bind_rows
#' @importFrom tidyr pivot_wider pivot_longer
#' @importFrom rlang arg_match is_bool
#'
#' @export
dh.createTableOne <- function(stats = NULL, vars = NULL, var_labs = NULL,
                              cat_labs = NULL, type = NULL, coh_labs = NULL,
                              coh_direction = "cols", cont_format = NULL,
                              inc_missing = NULL, sig_digits = 3,
                              perc_denom = NULL) {
  variable <- . <- cat_label <- var_label <- cohort <- value <- data_type <-
    miss_n_perc <- category <- coh_label <- avail_stats <- vars_list <-
    stats_cat <- stats_cont <- old_var <- cohort_labs <- stats_sub_coh <-
    stats_sub_vars <- NULL

  .checkArgs(
    stats, vars, type, inc_missing, perc_denom, cat_labs, var_labs,
    coh_labs, coh_direction, cont_format
  )

  .checkVarsInStats(stats, vars)

  stats_sub_vars <- .subsetVars(stats, vars)

  stats_sub_coh <- .subsetCoh(type, stats_sub_vars)

  stats_cat <- tibble(
    cohort = character(),
    variable = character(),
    category = character(),
    value = character()
  )

  stats_cont <- stats_cat

  if (nrow(stats$categorical) > 0) {
    stats_cat <- .formatCatStats(stats_sub_coh, perc_denom, sig_digits)

    if (!is.null(cat_labs)) {
      .checkLabsMatchCats(stats_cat, cat_labs)
      cat_labs <- .cleanLabs(cat_labs, c("variable", "category", "cat_label"))

      stats_cat <- stats_cat %>%
        left_join(., cat_labs, by = c("variable", "category")) %>%
        dplyr::select(cohort, variable, category = cat_label, value)
    }
  }

  if (nrow(stats_sub_vars$continuous) > 0) {
    stats_cont <- .formatContStats(stats_sub_coh, cont_format, sig_digits)
  }

  out <- bind_rows(stats_cat, stats_cont) %>%
    dplyr::mutate(variable = factor(variable, levels = vars)) %>%
    arrange(variable)

  if (!is.null(var_labs)) {
    var_labs_clean <- .cleanLabs(var_labs, c("variable", "var_label"))

    out <- out %>%
      left_join(., var_labs_clean, by = "variable") %>%
      dplyr::select(cohort, variable = var_label, category, value)
  }

  if (!is.null(coh_labs)) {
    coh_labs <- .cleanLabs(coh_labs, c("cohort", "cohort_labs"))

    out <- left_join(out, coh_labs, by = "cohort") %>%
      dplyr::select(cohort = cohort_labs, variable, category, value)
  }

  if (inc_missing == FALSE) {
    out <- out %>%
      dplyr::filter(!is.na(category))
  }

  if (coh_direction == "cols") {
    out <- out %>%
      pivot_wider(
        names_from = cohort,
        values_from = value
      )
  } else if (coh_direction == "rows") {
    out <- out %>%
      pivot_wider(
        names_from = c(variable, category),
        values_from = value
      )
  }

  return(out)
}

#' Performs argument checks
#'
#' @return Nothing if checks pass, else throws an error
#'
#' @importFrom checkmate assert_list assert_character assert_choice
#' assert_logical assert_data_frame assert_subset
#'
#' @noRd
.checkArgs <- function(stats, vars, type, inc_missing, perc_denom, cat_labs,
                       var_labs, coh_labs, coh_direction, cont_format) {
  assert_list(stats)
  assert_character(vars)
  assert_choice(type, c("cohort", "combined", "both"))
  assert_choice(coh_direction, c("rows", "cols"))
  assert_choice(cont_format, c("med_iqr", "mean_sd"))
  assert_logical(inc_missing)
  assert_choice(perc_denom, c("valid", "total"))

  if (!is.null(cat_labs)) {
    assert_data_frame(cat_labs)
    assert_subset(c("variable", "category", "cat_label"), colnames(cat_labs))
  }

  if (!is.null(var_labs)) {
    assert_data_frame(var_labs)
    assert_subset(c("variable", "var_label"), colnames(var_labs))
    assert_subset(vars, var_labs$variable)
  }

  if (!is.null(coh_labs)) {
    assert_data_frame(coh_labs)
    assert_subset(c("cohort", "cohort_labs"), colnames(coh_labs))

    distinct_cohorts <- map(stats, ~ .x$cohort) %>%
      unlist() %>%
      unique()

    if (type == "cohort") {
      distinct_cohorts <- distinct_cohorts[distinct_cohorts != "combined"]
    }

    assert_subset(distinct_cohorts, coh_labs$cohort)
  }
}

#' Checks that all the variable names provided to `vars` are available in the
#' object provided to `stats`
#'
#'
#' @return Nothing if all variable provided in `vars` exist in `stats`, else
#' throws an error.
#'
#' @noRd
.checkVarsInStats <- function(stats, vars) {
  stats_vars <- stats %>%
    map(~ .x$variable) %>%
    unlist() %>%
    unique()

  avail <- vars[vars %in% stats_vars]
  not_avail <- vars[!vars %in% stats_vars]

  if (length(not_avail > 0)) {
    stop(
      paste0(
        "The following variables provided in `vars` are not present in the
      statistics provided in `stats`\n\n",
        paste0(not_avail, collapse = ", ")
      )
    )
  }
}

#' Subsets `stats` to include only variables specified in `vars`
#'
#' @return Subset of `stats`
#'
#' @noRd
.subsetVars <- function(stats, vars_to_keep) {
  stats %>%
    map(~ dplyr::filter(., variable %in% vars_to_keep))
}

#' Subset stats based on argument to `type`
#'
#' @noRd
.subsetCoh <- function(type, stats) {
  if (type == "combined") {
    out <- stats %>%
      map(~ dplyr::filter(., cohort == "combined"))
  } else if (type == "cohort") {
    out <- stats %>%
      map(~ dplyr::filter(., cohort != "combined"))
  } else if (type == "both") {
    out <- stats
  }
}

#' Performs the initial formatting of categorical statistics
#'
#'
#' @return Tibble containing 5 columns: 'cohort', 'variable', 'category',
#' 'value' and 'data_type'
#'
#' @importFrom dplyr %>% filter mutate select
#'
#' @noRd
#'
.formatCatStats <- function(stats, perc_denom, sig_digits) {
  perc_valid <- perc_total <- value <- category <- cohort <- variable <- NULL

  out <- stats$categorical %>%
    mutate(
      across(c(perc_valid, perc_total), ~ signif(., sig_digits))
    )

  if (perc_denom == "total") {
    out <- out %>%
      mutate(value = paste0(value, " (", perc_total, ")"))
  } else if (perc_denom == "valid") {
    out <- out %>%
      mutate(value = ifelse(
        !is.na(category),
        paste0(value, " (", perc_valid, ")"),
        paste0(value, " (", perc_total, ")")
      ))
  }

  out <- out %>%
    dplyr::select(cohort, variable, category, value)

  return(out)
}

#' Performs the initial formatting of continuous statistics
#'
#' @param stats Exported object from dh.getStats
#' @param vars Character vector of variable names
#' @param cont_stats Character specifying which summary statistic to return
#' for continuous stats. Use 'med_iqr' to return the median and interquartile
#' range, use 'mean_sd' to return the mean and standard deviation.
#' @param inc_missing Boolean specifying whether to return missing values in
#' the output. Use TRUE for yes and FALSE for no.
#'
#' @return Tibble containing 5 columns: 'cohort', 'variable', 'category',
#' 'value' and 'data_type'. If `inc_missing` is TRUE contains an sixth column
#' 'miss_n_perc'
#'
#' @importFrom dplyr %>% filter mutate select
#'
#' @noRd
.formatContStats <- function(stats, cont_format, sig_digits) {
  perc_95 <- missing_perc <- valid_n <- missing_n <- category <- std.dev <-
    perc_50 <- perc_25 <- perc_75 <- value <- cohort <- variable <- NULL

  out <- stats$continuous %>%
    mutate(
      across(
        c(mean:perc_95, missing_perc), ~ signif(., sig_digits)
      )
    )

  out <- out %>%
    pivot_longer(
      cols = c(valid_n, missing_n),
      names_to = "category",
      values_to = "missing"
    ) %>%
    mutate(category = ifelse(category == "missing_n", NA, cont_format))

  if (cont_format == "mean_sd") {
    out <- out %>%
      mutate(value = paste0(mean, " \u00b1 ", std.dev))
  } else if (cont_format == "med_iqr") {
    out <- out %>%
      mutate(value = paste0(perc_50, " (", perc_25, ",", perc_75, ")"))
  }

  out <- out %>%
    mutate(value = ifelse(is.na(category),
      paste0(missing, " (", missing_perc, ")"), value
    )) %>%
    dplyr::select(cohort, variable, category, value) %>%
    mutate(category = case_when(
      category == "med_iqr" ~ "Median \u00b1 (IQR)",
      category == "mean_sd" ~ "Mean \u00b1 SD"
    ))

  return(out)
}

#' Checks that all categorical variables provided to `stats_cat` have
#' corresponding labels provided in `cat_labs`
#'
#'
#' @return Returns an error if labels have not been provided for all levels of
#' all categorical variables.
#'
#' @importFrom dplyr left_join %>% filter
#'
#' @noRd
.checkLabsMatchCats <- function(stats_cat, cat_labs) {
  category <- cat_label <- NULL

  test_cats <- left_join(stats_cat, cat_labs, by = c("variable", "category")) %>%
    dplyr::filter(category != "missing")

  missing_cats <- test_cats %>%
    dplyr::filter(!is.na(category) & is.na(cat_label))

  if (nrow(missing_cats) > 0) {
    stop(
      "The following categorical variables are included in 'vars'
      but do not have a corresponding labels for all categories provided in
      `cat_labs`\n\n", unique(missing_cats$variable)
    )
  }
}

#' Removes duplicates and selects only required columns
#'
#' @return Cleaned dataframe of labels
#'
#' @noRd
.cleanLabs <- function(labs, cols) {
  out <- labs %>%
    dplyr::select(all_of(cols)) %>%
    distinct()

  return(out)
}
lifecycle-project/ds-cs-functions documentation built on Nov. 18, 2024, 3:36 p.m.