R/summarize_cat_cat_data.R

Defines functions summarize_cat_cat_data sort_data flip_exception_categories add_n_to_category add_n_to_label replace_with_empty_string add_collapsed_categories mutate_data_label category_var_as_fct

Documented in summarize_cat_cat_data

# Helper function to create overarching factor variable for the response categories column
category_var_as_fct <-
  function(data_summary, fct_unions) {
    data_summary$.category <- factor(
      x = data_summary$.category,
      levels = fct_unions,
      labels = fct_unions,
      exclude = NULL
    )
    data_summary
  }


mutate_data_label <-
  function(
      data,
      data_label = "count",
      digits = 1,
      hide_label_if_prop_below = 0,
      decimal_symbol = ".") {
    percent_siblings <- any(c("percentage", "percentage_bare") == data_label)
    prop_family <- any(c("percentage", "percentage_bare", "proportion") == data_label)
    stat_col <- if (prop_family) ".proportion" else stringi::stri_c(ignore_null = TRUE, ".", data_label)

    fmt <-
      stringi::stri_c(
        ignore_null = TRUE, "%.", if (data_label == "count") 0 else digits, "f",
        if (data_label == "percentage") "%%"
      )


    ## Could replace fmt <- with a switch of scales::label_percent and scales::label_number

    out <- data
    out$.data_label <-
      dplyr::if_else(out[[stat_col]] >= hide_label_if_prop_below,
        out[[stat_col]],
        rep(NA, times = nrow(out))
      )
    if (percent_siblings) out$.data_label <- out$.data_label * 100
    out$.data_label <- sprintf(fmt = fmt, out$.data_label)
    out$.data_label <- stringi::stri_replace(out$.data_label, regex = " *NA%*$", replacement = "")
    out$.data_label <- stringi::stri_replace(out$.data_label, regex = "\\.", replacement = decimal_symbol)

    out
  }


add_collapsed_categories <-
  function(data_summary,
           sort_by = NULL,
           data_label = ".proportion",
           categories_treated_as_na = c(),
           call = rlang::caller_env()) {
    check_sort_by(x = data_summary$.category, sort_by = sort_by, call = call)


    lvls <- levels(data_summary$.category)
    lvls <- lvls[!lvls %in% categories_treated_as_na]
    if (length(sort_by) == 1 &&
      any(.saros.env$summary_data_sort1 == sort_by)) {
      sort_by <- subset_vector(vec = lvls, set = sort_by)
    }

    non_grouping_vars <-
      c(
        ".category",
        ".count", ".count_se",
        ".proportion", ".proportion_se",
        ".mean", ".mean_se", # ".mean_base",
        ".data_label", ".sum_value"
      )

    data_summary |>
      dplyr::mutate(.comb_categories = as.character(.data$.category) %in% .env$sort_by) |>
      dplyr::mutate(
        .sum_value = sum(.data[[if (data_label == "count") ".count" else ".proportion"]], na.rm = TRUE), # THis should be an argument: counts or proportions.
        .sum_value = dplyr::if_else(condition = .data$.comb_categories, # NB! survey weights
          true = .data$.sum_value,
          false = NA_real_
        ),
        .by = !tidyselect::any_of(non_grouping_vars)
      )
  }

replace_with_empty_string <- function(str) {
  if (is.null(str) || is.na(str)) "" else str
}


add_n_to_label <- function(data_summary,
                           add_n_to_dep_label = FALSE,
                           add_n_to_dep_label_prefix = " (N = ",
                           add_n_to_dep_label_suffix = ")",
                           add_n_to_indep_label = FALSE,
                           add_n_to_indep_label_prefix = " (N = ",
                           add_n_to_indep_label_suffix = ")",
                           add_n_to_label = FALSE,
                           add_n_to_label_prefix = " (N = ",
                           add_n_to_label_suffix = ")") {
  if (isFALSE(add_n_to_dep_label) && isFALSE(add_n_to_indep_label) && isFALSE(add_n_to_label)) {
    return(data_summary)
  }


  add_n_to_dep_label_prefix <- replace_with_empty_string(add_n_to_dep_label_prefix)
  add_n_to_dep_label_suffix <- replace_with_empty_string(add_n_to_dep_label_suffix)
  add_n_to_indep_label_prefix <- replace_with_empty_string(add_n_to_indep_label_prefix)
  add_n_to_indep_label_suffix <- replace_with_empty_string(add_n_to_indep_label_suffix)
  add_n_to_label_prefix <- replace_with_empty_string(add_n_to_label_prefix)
  add_n_to_label_suffix <- replace_with_empty_string(add_n_to_label_suffix)

  if (isTRUE(add_n_to_dep_label)) {
    add_to_var <- ".variable_label"
    count_var <- ".count_per_dep"
    levels(data_summary[[add_to_var]]) <-
      paste0(
        levels(data_summary[[add_to_var]]),
        add_n_to_dep_label_prefix,
        unique(data_summary[order(as.integer(data_summary[[add_to_var]])), count_var, drop = TRUE]),
        add_n_to_dep_label_suffix
      )
  }
  if (isTRUE(add_n_to_indep_label)) {
    add_to_var <- names(data_summary)[!names(data_summary) %in% .saros.env$summary_data_sort2]
    count_var <- ".count_per_indep_group"
    if (length(add_to_var)) {
      data_summary <-
        dplyr::arrange(
          data_summary,
          as.integer(.data[[add_to_var]]),
          .data[[".variable_label"]]
        )
      data_summary[[add_to_var]] <-
        factor(
          paste0(
            as.character(data_summary[[add_to_var]]),
            add_n_to_indep_label_prefix,
            data_summary[[count_var]],
            add_n_to_indep_label_suffix
          ),
          levels = unique(paste0(
            as.character(data_summary[[add_to_var]]),
            add_n_to_indep_label_prefix,
            data_summary[[count_var]],
            add_n_to_indep_label_suffix
          )),
          exclude = character(),
          ordered = is.ordered(data_summary[[add_to_var]])
        )
    }
  }


  data_summary
}



add_n_to_category <- function(data_summary,
                              add_n_to_category = FALSE,
                              add_n_to_category_prefix = " (N = ",
                              add_n_to_category_infix = ",",
                              add_n_to_category_suffix = ")") {
  if (isFALSE(add_n_to_category)) {
    return(data_summary)
  }

  add_n_to_category_prefix <- replace_with_empty_string(add_n_to_category_prefix)
  add_n_to_category_infix <- replace_with_empty_string(add_n_to_category_infix)
  add_n_to_category_suffix <- replace_with_empty_string(add_n_to_category_suffix)

  out <-
    data_summary |>
    dplyr::mutate(
      .category_n_rng = paste0(range(.data[[".count"]], na.rm = TRUE),
        collapse = add_n_to_category_infix
      ),
      .by = tidyselect::all_of(".category")
    ) |>
    tidyr::unite(
      col = ".category_new",
      tidyselect::all_of(c(".category", ".category_n_rng")),
      sep = add_n_to_category_prefix, remove = FALSE, na.rm = TRUE
    ) |>
    dplyr::mutate(.category_new = paste0(.data[[".category_new"]], add_n_to_category_suffix))
  new_labels <- unique(out$.category_new)
  names(new_labels) <- levels(out$.category)
  levels(out$.category) <- new_labels
  out
}



flip_exception_categories <- function(data_summary,
                                      sort_by = NULL,
                                      categories_treated_as_na = c(),
                                      call = rlang::caller_env()) {
  if (is.null(sort_by) ||
    length(categories_treated_as_na) == 0 ||
    !all(sort_by %in% .saros.env$summary_data_sort1) ||
    !any(unique(data_summary$.category) %in% categories_treated_as_na)) {
    return(data_summary)
  }
  releveling_helper_function <- function(lvls) {
    # c(lvls[lvls %in% categories_treated_as_na], lvls[!lvls %in% categories_treated_as_na])

    lvls[lvls %in% categories_treated_as_na]
  }
  position <-
    if (sort_by %in% c(".top", ".upper", ".mid_upper")) 0 else max(as.integer(factor(data_summary$.category)))

  data_summary$.category <- forcats::fct_relevel(data_summary$.category,
    releveling_helper_function,
    after = position
  )
  data_summary
}


# Helper function that sorts output (if !is.null(sort_by)), optionally in descending order
sort_data <- function(data_summary,
                      sort_by = NULL,
                      descend = FALSE,
                      labels_always_at_bottom = NULL,
                      labels_always_at_top = NULL,
                      translations = eval(formals(makeme)$translations),
                      indep_names = character(0),
                      call = rlang::caller_env()) {
  if (is.null(sort_by)) {
    return(dplyr::arrange(data_summary, as.integer(.data$.variable_label), as.integer(.data$.category)))
  }

  if (all(sort_by %in% names(data_summary))) { # E.g. .variabel_name, .variable_label, x1_sex

    sort_col <- sort_by
  } else if ((length(sort_by) == 1 &&
    sort_by %in% .saros.env$summary_data_sort1) || # E.g. .top, .upper, .mid_upper, c("A bit", "A lot")
    all(sort_by %in% unique(data_summary$.category))) {
    sort_col <- c(
      ".sum_value",
      indep_names[lapply(indep_names, function(indep_name) is.ordered(data_summary[[indep_name]])) |> unlist()],
      ".category"
    )
  }
  #### descend IS CURRENTLY GLOBAL ACROSS ALL VARIABLES:

  if (isTRUE(descend)) {
    data_summary <-
      dplyr::arrange(data_summary, dplyr::across(tidyselect::all_of(sort_col), dplyr::desc))
  } else {
    data_summary <-
      dplyr::arrange(data_summary, dplyr::pick(tidyselect::all_of(sort_col))) # Incorrect
  }

  uniques <- as.character(unique(data_summary$.variable_label))

  if (!all(is.na(data_summary$.variable_label))) {
    data_summary$.variable_label <- forcats::fct_relevel(data_summary$.variable_label, uniques)

    labels_always_at_bottom <- labels_always_at_bottom[labels_always_at_bottom %in% uniques]
    data_summary$.variable_label <- forcats::fct_relevel(data_summary$.variable_label,
      labels_always_at_bottom,
      after = length(uniques)
    )

    labels_always_at_top <- labels_always_at_top[labels_always_at_top %in% uniques]
    data_summary$.variable_label <- forcats::fct_relevel(data_summary$.variable_label,
      labels_always_at_top,
      after = 0
    )
  }

  if (length(indep_names) > 0) {
    for (indep_name in indep_names) {
      if (is.factor(data_summary[[indep_name]])) {
        uniques <- rev(levels(data_summary[[indep_name]]))
      } else {
        uniques <- rev(as.character(unique(data_summary[[indep_name]])))
      }
      data_summary[[indep_name]] <- forcats::fct_relevel(data_summary[[indep_name]], uniques)
      if (any(levels(data_summary[[indep_name]]) %in% translations$crowd_others)) {
        data_summary[[indep_name]] <- forcats::fct_relevel(data_summary[[indep_name]],
          translations$crowd_others,
          after = length(uniques)
        )
      }
    }
  }
  data_summary
}

#' Summarize a survey dataset for use in tables and graphs
#'
#' @inheritParams makeme
#' @param call *Internal call*
#'
#'   `obj:<call>` // *Default:* `rlang::caller_env()` (`optional`)
#'
#'   Both the absolute and relative folderpaths are required, as strings.
#' @importFrom rlang !!!
#' @keywords internal
#' @return Dataset with the columns: `.variable_name`, `.variable_label`, `.category`,
#'   `.count`, `.count_se`, `.count_per_dep`, `.count_per_indep_group`, `.proportion`, `.proportion_se`,
#'   `.mean`, `.mean_se`, indep-variable(s), `.data_label`, `.comb_categories`, `.sum_value`,
#'   `.variable_label_prefix`
#'
summarize_cat_cat_data <-
  function(data,
           dep = colnames(data),
           indep = NULL,
           ...,
           showNA = c("ifany", "always", "never"),
           totals = FALSE,
           sort_by = ".upper",
           data_label = c("percentage_bare", "percentage", "proportion", "count"),
           digits = 0,
           add_n_to_dep_label = FALSE,
           add_n_to_indep_label = FALSE,
           add_n_to_label = FALSE,
           add_n_to_category = FALSE,
           hide_label_if_prop_below = .01,
           data_label_decimal_symbol = ".",
           categories_treated_as_na = NULL,
           label_separator = NULL,
           descend = FALSE,
           labels_always_at_bottom = NULL,
           labels_always_at_top = NULL,
           translations = list(),
           call = rlang::caller_env()) {
    showNA <- rlang::arg_match(showNA)
    data_label <- rlang::arg_match(data_label)


    if (!(inherits(data, what = "data.frame") || !inherits(data, what = "survey"))) {
      cli::cli_abort("{.arg data} should be a data.frame/tibble or survey object, not {.obj_type_friendly {data}}.")
    }

    if (any(dep %in% indep)) {
      return()
    }


    cross_table_output <-
      crosstable(data,
        dep = dep,
        indep = indep,
        showNA = showNA,
        totals = totals,
        translations = translations
      )

    check_sort_by(x = cross_table_output$.category, sort_by = sort_by)


    # fct_unions <- get_common_levels(data=data, col_pos=match(dep, colnames(data)))
    fct_unions <- levels(cross_table_output[[".category"]])


    cross_table_output |>
      mutate_data_label(
        data_label = data_label,
        digits = digits,
        hide_label_if_prop_below = hide_label_if_prop_below,
        decimal_symbol = data_label_decimal_symbol
      ) |>
      category_var_as_fct(fct_unions = fct_unions) |>
      add_collapsed_categories(
        sort_by = sort_by,
        categories_treated_as_na = categories_treated_as_na,
        data_label = data_label
      ) |>
      dplyr::mutate(
        .variable_label_prefix = get_main_question(.data$.variable_label,
          label_separator = label_separator,
          warn_multiple = FALSE
        ),
        .variable_label = keep_subitem(
          fct = .data$.variable_label,
          label_separator = label_separator
        )
      ) |>
      add_n_to_label(
        add_n_to_dep_label = add_n_to_dep_label,
        add_n_to_dep_label_prefix = translations$add_n_to_dep_label_prefix,
        add_n_to_dep_label_suffix = translations$add_n_to_dep_label_suffix,
        add_n_to_indep_label = add_n_to_indep_label,
        add_n_to_indep_label_prefix = translations$add_n_to_indep_label_prefix,
        add_n_to_indep_label_suffix = translations$add_n_to_indep_label_suffix,
        add_n_to_label = add_n_to_label,
        add_n_to_label_prefix = translations$add_n_to_label_prefix,
        add_n_to_label_suffix = translations$add_n_to_label_suffix
      ) |>
      add_n_to_category(
        add_n_to_category = add_n_to_category,
        add_n_to_category_prefix = translations$add_n_to_category_prefix,
        add_n_to_category_infix = translations$add_n_to_category_infix,
        add_n_to_category_suffix = translations$add_n_to_category_suffix
      ) |>
      flip_exception_categories(
        categories_treated_as_na = categories_treated_as_na,
        sort_by = sort_by
      ) |>
      sort_data(
        indep_names = indep,
        sort_by = sort_by,
        descend = descend,
        labels_always_at_bottom = labels_always_at_bottom,
        labels_always_at_top = labels_always_at_top,
        translations = translations
      )
  }

Try the saros package in your browser

Any scripts or data that you put into this service are public.

saros documentation built on June 8, 2025, 10:43 a.m.