# __________________ #< 60cfc78f594e5611a6eaaf34a2b212ae ># __________________
# Summarization functions ####
## .................. #< eb074f5b5a560bb19ff907907f958da2 ># ..................
## Empty summaries ####
# Empty summary
create_empty_summary_ <- function(data, group_col){
checkmate::assert_string(group_col, min.chars = 1)
data %>%
dplyr::ungroup() %>%
dplyr::select(!!as.name(group_col)) %>%
dplyr::group_by(!!as.name(group_col)) %>%
dplyr::group_keys()
}
## .................. #< 79a1a50b2f6fd2bc4a62237a6399b27c ># ..................
## Size summaries ####
# Size summary
create_size_summary_ <- function(data, group_col, name="size"){
checkmate::assert_string(group_col, min.chars = 1)
checkmate::assert_string(name, min.chars = 1)
data %>%
dplyr::ungroup() %>%
dplyr::select(!!as.name(group_col)) %>%
dplyr::group_by(!!as.name(group_col)) %>%
dplyr::summarise(!!name := dplyr::n(),
.groups = "drop")
}
## .................. #< cad874f32d32a8eae09090d2894d7ad5 ># ..................
## ID column summaries ####
# Number of IDs/levels summary
create_id_summaries_ <- function(data, group_col, id_cols, name_prefix="# "){
checkmate::assert_string(group_col, min.chars = 1)
checkmate::assert_string(name_prefix, null.ok = TRUE)
checkmate::assert_character(
id_cols,
min.chars = 1,
any.missing = FALSE,
min.len = 1
)
summary <- data %>%
dplyr::ungroup() %>%
dplyr::select(!!!rlang::syms(c(group_col, id_cols))) %>%
dplyr::group_by(!!as.name(group_col)) %>%
dplyr::summarise(
dplyr::across(dplyr::one_of(id_cols), function(x){length(unique(x))}),
.groups = "drop"
)
if (!is.null(name_prefix) &&
nchar(name_prefix) > 0){
summary <- summary %>%
dplyr::rename_with(
~ paste0(name_prefix, .),
-dplyr::one_of(group_col)
)
}
summary
}
## .................. #< abd5e8f22decefad01cca729a155076c ># ..................
## Numerical summaries ####
# Numeric column summary
create_num_summaries_ <- function(data,
group_col,
num_cols,
fns = list("mean" = mean, "sum" = sum),
rename = FALSE) {
checkmate::assert_string(group_col, min.chars = 1)
checkmate::assert_character(
num_cols,
min.chars = 1,
any.missing = FALSE,
min.len = 1
)
checkmate::assert_flag(rename)
checkmate::assert_list(fns, types = "function")
if (length(fns) > 1 && is.null(names(fns))){
stop("When `fns` has length > 1, it must be named.")
}
if (length(fns) == 1 && is.null(names(fns))){
fns <- fns[[1]]
}
summary <- data %>%
dplyr::ungroup() %>%
dplyr::select(!!!rlang::syms(c(group_col, num_cols))) %>%
dplyr::group_by(!!as.name(group_col)) %>%
dplyr::summarise(dplyr::across(dplyr::one_of(num_cols),
fns),
.groups = "drop")
if (isTRUE(rename)){
summary <- summary %>%
# Rename so `xxx_mean <- mean(xxx)` and `xxx_sum <- sum(xxx)`
dplyr::rename_with(
~ gsub(
pattern = "(^.*)_([[:alnum:]]*)$",
replacement = "\\2(\\1)",
x = .
),
where(is.numeric)
)
}
summary
}
## .................. #< 440b147b963f8a7fd202661bfc3b068e ># ..................
## Categorical summaries ####
# Categorical summary
# One column per level per cat_col
create_cat_summaries_ <- function(data,
group_col,
cat_cols,
max_cat_prefix_chars = 5,
name_prefix = "# ") {
checkmate::assert_string(group_col, min.chars = 1)
checkmate::assert_character(
cat_cols,
min.chars = 1,
any.missing = FALSE,
min.len = 1
)
tmp_n_var <- create_tmp_var(data, tmp_var = ".n")
summary <- data %>%
dplyr::ungroup() %>%
dplyr::select(!!!rlang::syms(c(group_col, cat_cols))) %>%
dplyr::mutate(dplyr::across(dplyr::one_of(cat_cols), as.character)) %>%
tidyr::gather(key = "cat_col", value = "cat_val", dplyr::all_of(cat_cols)) %>%
dplyr::count(!!as.name(group_col), .data$cat_col, .data$cat_val,
name = tmp_n_var) %>%
rename_cat_levels_for_summary_(
max_cat_prefix_chars = max_cat_prefix_chars,
name_prefix = name_prefix
) %>%
dplyr::select(dplyr::one_of(group_col, "cat_name", tmp_n_var)) %>%
tidyr::spread(key = .data$cat_name,
value = !!as.name(tmp_n_var),
fill = 0)
summary
}
# NOTE: Must ensure the same names are produced in this
# and in create_cat_summaries_
create_cat_name_map_ <- function(data,
cat_cols,
max_cat_prefix_chars = 5,
name_prefix = "# "){
checkmate::assert_character(
cat_cols,
min.chars = 1,
any.missing = FALSE,
min.len = 1
)
data %>%
dplyr::ungroup() %>%
dplyr::select(!!!rlang::syms(c(cat_cols))) %>%
dplyr::mutate(dplyr::across(dplyr::one_of(cat_cols), as.character)) %>%
dplyr::distinct() %>%
tidyr::gather(key = "cat_col", value = "cat_val", dplyr::all_of(cat_cols)) %>%
dplyr::distinct() %>%
dplyr::mutate(orig_cat_col = .data$cat_col) %>%
position_first(col = "orig_cat_col") %>%
rename_cat_levels_for_summary_(
max_cat_prefix_chars = max_cat_prefix_chars,
name_prefix = name_prefix
) %>%
dplyr::arrange(.data$cat_col, .data$cat_val) %>%
dplyr::select(-"cat_col") %>%
split(f = .$orig_cat_col, drop = TRUE) %>%
purrr::map(.f = ~{
vals <- .x[["cat_name"]]
nms <- .x[["cat_val"]]
vals %>%
setNames(nms)
})
}
rename_cat_levels_for_summary_ <- function(cat_levels_map,
max_cat_prefix_chars = 5,
name_prefix = "# "){
checkmate::assert_data_frame(cat_levels_map)
checkmate::assert_count(max_cat_prefix_chars)
checkmate::assert_string(name_prefix, null.ok = TRUE)
cat_levels_map <- cat_levels_map %>%
dplyr::mutate(
cat_col = tolower(.data$cat_col),
cat_val = tolower(.data$cat_val),
cat_col = substr(.data$cat_col, start = 1, stop = max_cat_prefix_chars),
cat_name = paste0(.data$cat_col, "_", .data$cat_val),
cat_name = gsub("^_", "", .data$cat_name)
)
if (!is.null(name_prefix) &&
nchar(name_prefix) > 0){
cat_levels_map <- cat_levels_map %>%
dplyr::mutate(cat_name = paste0(name_prefix, .data$cat_name))
}
}
# Combined categorical summaries
# I.e. one single column per cat_col
create_combined_cat_summaries_ <- function(
data,
group_cols,
cat_cols,
cat_levels,
warn_zero_variance = TRUE
) {
checkmate::assert_character(
group_cols,
min.chars = 1,
any.missing = FALSE,
min.len = 1
)
checkmate::assert_character(
cat_cols,
min.chars = 1,
any.missing = FALSE,
min.len = 1
)
checkmate::assert_flag(warn_zero_variance)
purrr::map(.x = cat_cols, .f = ~ {
create_combined_cat_summary_(
data = data,
group_cols = group_cols,
cat_col = .x,
cat_levels = cat_levels,
warn_zero_variance = warn_zero_variance
)
}) %>%
purrr::reduce(dplyr::full_join, by = group_cols)
}
# Combined summary for single cat_col
create_combined_cat_summary_ <- function(data, group_cols, cat_col, cat_levels, warn_zero_variance=TRUE){
if (is.list(cat_levels)){
# When cat_col is not in cat_levels,
# this returns NULL
cat_levels <- cat_levels[[cat_col]]
}
cat_summary <- NULL
if (!is.null(cat_col)){
cat_summary <- data %>%
dplyr::count(!!!rlang::syms(c(group_cols, cat_col)))
if (is.null(cat_levels)){
checkmate::assert_factor(data[[cat_col]])
cat_levels <- levels(data[[cat_col]])
}
if (length(cat_levels) == 1 && cat_levels %in% c(".majority", ".minority")){
slice_fn <- ifelse(cat_levels == ".majority", which.max, which.min)
cat_levels <- data %>%
# TODO time whether summarize with the existing counts is faster
dplyr::count(!!as.name(cat_col)) %>%
dplyr::slice(slice_fn(n)) %>%
dplyr::pull(!!as.name(cat_col)) %>%
as.character()
}
# Convert to always be a named numeric vector (weights)
if (!is.numeric(cat_levels)) {
cat_levels <- rep(1, times = length(cat_levels)) %>%
setNames(nm = cat_levels)
}
# Get counts for the selected cat_levels
cat_summary <- cat_summary %>%
tidyr::spread(key = !!as.name(cat_col),
value = .data$n,
fill = 0) %>%
dplyr::select(!!!rlang::syms(c(group_cols, names(cat_levels))))
# Scale, weight and combine the `cat_levels` columns
cat_summary <- scale_and_combine_(
data = cat_summary,
weights = cat_levels,
include_cols = names(cat_levels),
scale_fn = standardize_,
col_name = cat_col,
handle_no_cols = "stop"
) %>%
dplyr::select(dplyr::one_of(group_cols, cat_col))
if (isTRUE(warn_zero_variance) &&
sd(cat_summary[[cat_col]]) == 0){
warning(simpleWarning(
paste0(
"Combining the standardized level counts",
" for the `cat_cols` column '",
cat_col,
"' led to a zero-variance vector. ",
"Consider not balancing this column or change the included `cat_levels`."
),
call = if (p <- sys.parent(5 + 1))
sys.call(p)
))
}
}
cat_summary
}
## .................. #< 60cfc78f594e5611a6eaaf34a2b212ae ># ..................
## Ranking utils ####
# Apply rank() to columns
rank_numeric_cols_ <- function(data, cols = NULL){
checkmate::assert_character(
cols,
any.missing = FALSE,
null.ok = TRUE,
min.chars = 1
)
if (is.null(cols)){
cols <- data %>%
dplyr::select(where(is.numeric)) %>%
colnames()
} else {
unknown_cols <- setdiff(cols, colnames(data))
if (length(unknown_cols) > 0){
stop(paste0("`cols` had unknown names: ", paste0(unknown_cols, collapse = ", ")))
}
}
data %>%
dplyr::mutate(dplyr::across(dplyr::one_of(cols), rank))
}
# Create column with weighted-average rank of numeric columns
mean_rank_numeric_cols_ <- function(
data,
cols = NULL,
col_name = "mean_rank",
rank_weights = NULL,
already_rank_cols = character(0)) {
if (is.null(cols)){
cols <- data %>%
dplyr::select(where(is.numeric)) %>%
colnames()
}
if (is.null(rank_weights)){
rank_weights <- rep(1, times = length(cols)) %>%
setNames(nm = cols)
}
# Calculate average SD rank
sd_ranks <- data %>%
rank_numeric_cols_(cols = setdiff(cols, already_rank_cols))
sd_ranks[[col_name]] <- sd_ranks %>%
dplyr::ungroup() %>%
dplyr::select(dplyr::one_of(cols)) %>%
purrr::pmap_dbl(.f = weighted_mean_, weights = rank_weights)
sd_ranks
}
# Calculates a weighted mean of named numbers
# ... and weights must have same names and length
weighted_mean_ <- function(..., weights){
# Check arguments ####
assert_collection <- checkmate::makeAssertCollection()
checkmate::assert_numeric(x = weights, lower = 0, add = assert_collection)
checkmate::reportAssertions(assert_collection)
# End of argument checks ####
# Order the row values and the weights
r <- c(...)
r <- r[order(names(r))]
weights <- weights[order(names(weights))]
if (!all(names(r) == names(weights))){
stop("`...` and `weights` must have the exact same names.")
}
# Sum to one
weights <- weights / sum(weights)
sum(r * weights)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.