#' @title tabulate
#' @description
#' tji asd asd sda sad sa d tji asd asd sda sad sa tji asd asd sda sad sa tji asd asd sda sad sa
#' tji asd asd sda sad sa tji asd asd sda sad sa tji asd asd sda sad sa tji asd asd sda sad sa
#' #'
#' @param data A dataframe
#' @param groups [str] string vector of variables or tidy supported selection ex. matches(...)
#' @param cols [str] string vector of variables or tidy supported selection ex. matches(...)
#' @param weights [str] string vector of variables or tidy supported selection ex. matches(...)
#' @param samples [str] string vector of variables or tidy supported selection ex. matches(...)
#' @param return_mean [boolean] for nominal variables
#' @param values_drop_na [boolean] if the base and pct should include NAs
#' @param variable_sep [str] regex used to separate the variable column
#' @param variable_sep_suffix [str] name of the variable_sep right-hand side output column
#' @param keep_empty_levels [str] no levels imputations
#' @return frequency table, dataframe/tibble, A standardized ET
#'
#' @import dplyr
#' @import tidyr
#' @importFrom Hmisc wtd.mean wtd.var
#' @importFrom haven is.labelled
#' @importFrom rlang is_empty
#' @importFrom magrittr `%<>%`
#' @importFrom glue glue
#' @importFrom purrr imap_dfr map_dfr map_lgl
#' @import stringr
#' @import utils
#' @export
#'
#' @examples
#'
#' tabulate(iris, Species)
#'
tabulate <- function(data, cols, weights = NULL, groups = NULL, samples = NULL,
return_mean = FALSE, values_drop_na = TRUE,
variable_sep = NULL, variable_sep_suffix = NULL, keep_empty_levels = FALSE) {
msg_param_conflict <- "Note: If data contains missing values (NA's) mean calculation will return missing values (NA's)"
msg_fullbase <- "Note: values_drop_na=FALSE increases calculation time by a factor of 3 (approximately)"
msg_separate <- "Note: Separating variable components, this may take some time..."
msg_weight_overlap <- "WARNING: weights overlap with other arguments"
msg_ignore_weight <- "Ignoring weights of type character: {paste(no_good, collapse = ',')}"
msg_ignore_question <- "Ignoring cols of type character [return_mean=TRUE]: {paste(no_good, collapse = ',')}"
msg_no_question_left <- "No numeric questions left"
msg_no_weight_left <- "No numeric weights left"
msg_inputet_rows_excisitn_levels <- "Rows imputed due to missing values on existing levels: {nrow(extra_empty_cols)}"
cols <- colnames(dplyr::select(data,{{cols}}))
if (!rlang::is_empty(expr(weights))) weights <- colnames(select(data,{{weights}} ) ); if(rlang::is_empty(weights)) weights <- NULL
if (!rlang::is_empty(expr(groups))) groups <- colnames(select(data,{{groups}} ) ); if(rlang::is_empty(groups)) groups <- NULL
if (!rlang::is_empty(expr(samples))) samples <- colnames(select(data,{{samples}} ) ); if(rlang::is_empty(samples)) samples <- NULL
# Check for varibles present
if (!values_drop_na) message(msg_fullbase)
in_cols <- c(cols, weights, groups, samples)
c_missing <- in_cols[!(in_cols %in% colnames(data))] %>% paste(collapse = ",")
if (!is_empty(c_missing) && c_missing != "") stop(glue::glue("Missing column(s): {c_missing}"))
data %<>% select(any_of(c(cols, weights, groups, samples)))
# Making a label and level dictionary to fill empty levels with at the end
if(keep_empty_levels){
levels_df <- data %>%
select(all_of(cols)) %>% select(where(~{haven::is.labelled(.x)|is.factor(.x)})) %>%
distinct() %>%
mutate(across(everything(), ~{
attr_var <- ifelse(!is.null(attr(.,"labels")),"labels","levels")
factor(.x, levels = attr(.x, attr_var)) })) %>%
purrr::imap_dfr(~{tibble(variable = .y, levels = levels(.x)) })
}
# Check cols type to calculate mean
if (return_mean) {
cols <- check_and_ignore_wrong_type(data = data, variables = cols,
warning_msg = msg_ignore_question,
stop_msg = msg_no_question_left)
}
# Check weights type to calculate, and ignore character only.
if (!is.null(weights) ){
weights <- check_and_ignore_wrong_type(data = data, variables = weights,
warning_msg = msg_ignore_weight,
stop_msg = msg_no_weight_left)
}
# Make everything class character to be able to turn/pivot everything
data %<>%
mutate(across(all_of(unique(c(cols, groups, samples, weights))),
.fns = ~as.character(.x)))
# This is to duplicate only the columns that overlap, not all
overlapping_questions_sample <- intersect(cols, samples)
overlapping_questions_group <- intersect(cols, groups)
overlapping_samples_group <- intersect(groups, samples)
overlapping_weight <- intersect(unique(c(cols, samples, groups)), weights)
if (!is_empty(overlapping_weight)) message(msg_weight_overlap)
# Number #1 to #3 turns data. Number #4 renames the grouping vars with a group_ prefix.
#1
if (!is.null(weights)) {
data %<>% gather_longer(variables = weights, .names_to = "weight", .values_to = "weight_value",
.keep_variables = unique(overlapping_weight))
} else {
data %<>% mutate(weight = "unweighted", weight_value = 1)
}
#2
data %<>% gather_longer(variables = cols, .names_to = "variable_output", .values_to = "value_output",
.keep_variables = unique(c(overlapping_questions_sample, overlapping_questions_group)),
.values_drop_na = values_drop_na)
#3
if (!is_empty(samples)) {
data %<>% gather_longer(variables = samples, .names_to = "sample_class", .values_to = "sample",
.keep_variables = unique(overlapping_samples_group))
}
#4
if (!is.null(groups)) {
data %<>% rename_with(.fn = ~{paste0("group_",.)}, .cols = matches(paste0("^",groups,"$")))
groups <- paste("group", groups, sep = "_")
}
# This renaming in the beginning allow for value/variable as input names in groups/samples args
data %<>% rename(value = value_output, variable = variable_output)
# Calculate percentages
if (!return_mean) {
res <- data %>%
mutate(weight_value = as.double(weight_value)) %>%
group_by(across(matches("^(sample|weight$|variable$)")), across(all_of(groups)), value, .drop = TRUE) %>%
summarise(n = sum(weight_value, na.rm = TRUE), .groups = "drop_last") %>%
mutate(base = sum(n), pct = n / base)
} else if (return_mean) {
# Calculate means
if (!values_drop_na) message(msg_param_conflict)
res <- data %>%
mutate(value = as.double(value),
weight_value = as.double(weight_value)) %>%
group_by(across(matches("^(sample|weight$|variable$)")), across(all_of(groups)), .drop = TRUE) %>%
mutate(weight_value_notna = ifelse(is.na(value), NA, weight_value)) %>%
summarise(base = sum(weight_value_notna, na.rm = values_drop_na),
mean = Hmisc::wtd.mean(weights = weight_value, x = value, na.rm = values_drop_na),
stdev = sqrt(Hmisc::wtd.var(weights = weight_value, x = value, na.rm = values_drop_na)),
.groups = "drop_last") %>%
mutate(n = base, value = "numeric")
}
# Output column order
column_order_char <- paste0("^", c("sample", "group", "weight", "variable", "value"))
column_order_num <- paste0("^", c("n", "base", "pct", "mean", "stdev"), "$")
res %<>% ungroup() %>%
relocate(matches(c(column_order_char, column_order_num)), .after = last_col())
# The empty levels are filled with 0 here.
if (keep_empty_levels && nrow(levels_df)>0){
# make key to check levels within each group
res %<>% unite("fake_res_key", matches(rev(column_order_char)[-1]), remove = FALSE)
extra_empty_cols <-
unique(res$fake_res_key) %>%
purrr::map_dfr(~{
current_key_var <- filter(res, fake_res_key == .x) %>% pull(variable) %>% unique()
levels_for_current_var <- filter(levels_df, variable == current_key_var) %>% pull(levels)
current_var_resulsts_df <- filter(res, fake_res_key == .x)
fake_res <- current_var_resulsts_df %>%
mutate(value = "tmp") %>%
group_by(across(.cols = -matches(column_order_num) ) ) %>%
slice(1) %>% ungroup() %>%
mutate(n = 0 , pct = 0)
value_for_current_var <- current_var_resulsts_df %>% pull(value) %>% unique()
level_not_in_results <- setdiff(levels_for_current_var, value_for_current_var)
purrr::map_dfr(level_not_in_results, ~{mutate(fake_res, value = .x) })
})
message(glue::glue(msg_inputet_rows_excisitn_levels))
res <- bind_rows(res, extra_empty_cols) %>% select(-fake_res_key)
res %<>% arrange(across(matches(column_order_char)))
}
# Separation of variable into variable and (brand)code
if (is.null(variable_sep_suffix)) variable_sep_suffix <- "brandcode"
if (!is.null(variable_sep)) {
message(msg_separate)
new_col <- glue::glue("group_{variable_sep_suffix}")
res %<>% tidyr::separate(variable, into = c("variable", new_col), sep = variable_sep,
convert = TRUE, fill = "right", extra = "merge") %>%
relocate(all_of(new_col), .before = variable)
}
return(res)
}
utils::globalVariables(c("weight_value", "value", "base","weight_value_notna","fake_res_key","variable","variable_output","value_output")) # https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887
utils::globalVariables("where") # https://stackoverflow.com/questions/40251801/how-to-use-utilsglobalvariables
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.