Nothing
#' Fast and Powerful yet Simple to Use Summarise
#'
#' @description
#' [summarise_plus()] creates a new aggregated data table with the desired grouping.
#' It can output only the deepest nested combination of the grouping variables (default)
#' or you can also output every possible combination of the grouping variables at once,
#' with just one small change. Besides the normal summary functions like sum, mean
#' or median, you can also calculate their respective weighted version by just
#' setting a weight variable.
#'
#' @param data_frame A data frame to summarise.
#' @param class A vector containing all grouping variables.
#' @param values A vector containing all variables that should be summarised.
#' @param statistics Available functions:
#' - "sum" -> Weighted and unweighted sum
#' - "sum_wgt" -> Sum of all weights
#' - "freq" -> Unweighted frequency
#' - "freq_g0" -> Unweighted frequency of all values greater than zero
#' - "pct_group" -> Weighted and unweighted percentages within the respective group
#' - "pct_total" -> Weighted and unweighted percentages compared to the grand total
#' - "mean" -> Weighted and unweighted mean
#' - "median" -> Weighted and unweighted median
#' - "mode" -> Weighted and unweighted mode
#' - "min" -> Minimum
#' - "max" -> Maximum
#' - "sd" -> Weighted and unweighted standard deviation
#' - "variance" -> Weighted and unweighted standard variance
#' - "first" -> First value
#' - "last" -> Last value
#' - "pn" -> Weighted and unweighted percentiles (any p1, p2, p3, ... possible)
#' - "missing" -> Missings generated by the value variables
#' @param formats A list in which is specified which formats should be applied to which
#' class variables.
#' @param types A character vector specifying the different combinations of group
#' variables which should be computed when using nesting = "all". If left empty all
#' possible combinations will be computed.
#' @param weight Put in a weight variable to compute weighted results.
#' @param nesting The predefined value is "deepest" meaning that only the fully
#' nested version of all class variables will be computed. If set to "all", all
#' possible combinations will be computed in one data table. The option "single"
#' only outputs the ungrouped summary of all class variables in one data table.
#' @param merge_back Newly summarised variables can be merged back to the original
#' data frame if TRUE. Only works if nested = "deepest and no formats are defined.
#' @param na.rm FALSE by default. If TRUE removes all NA values from the class variables.
#' @param monitor FALSE by default. If TRUE, outputs two charts to visualize the
#' functions time consumption.
#' @param notes TRUE by default. Prints notifications about NA values produced by
#' class variables during summarise.
#'
#' @details
#' [summarise_plus()] is based on the 'SAS' procedure Proc Summary, which provides
#' efficient and readable ways to perform complex aggregations.
#'
#' Normally you would compute new categorical variables beforehand - probably even in
#' different forms, if you wanted to have different categorizations - and bloat up
#' the data set. After all this recoding footwork you could finally use multiple
#' summaries to compute all the stats you need to then put them back together. With this
#' function this is no more necessary.
#'
#' In [summarise_plus()] you put in the original data frame and let the recoding happen
#' via format containers. This is very efficient, since new variables and categories
#' are only created just before the summarise happens.
#'
#' Additionally you can specify whether you only want to produce the all nested version
#' of all group variables or whether you want to produce every possible combination in
#' one go. All with a single option.
#'
#' The function is optimized to always take the fastest route, depending on the options
#' specified.
#'
#' @return
#' Returns a summarised data table.
#'
#' @seealso
#' Creating formats: [discrete_format()] and [interval_format()].
#'
#' Functions that also make use of formats: [frequencies()], [crosstabs()],
#' [any_table()], [recode()], [recode_multi()].
#'
#' @examples
#' # Example formats
#' age. <- discrete_format(
#' "Total" = 0:100,
#' "under 18" = 0:17,
#' "18 to under 25" = 18:24,
#' "25 to under 55" = 25:54,
#' "55 to under 65" = 55:64,
#' "65 and older" = 65:100)
#'
#' sex. <- discrete_format(
#' "Total" = 1:2,
#' "Male" = 1,
#' "Female" = 2)
#'
#' income. <- interval_format(
#' "Total" = 0:99999,
#' "below 500" = 0:499,
#' "500 to under 1000" = 500:999,
#' "1000 to under 2000" = 1000:1999,
#' "2000 and more" = 2000:99999)
#'
#' # Example data frame
#' my_data <- dummy_data(1000)
#'
#' # Call function
#' all_nested <- my_data |>
#' summarise_plus(class = c(year, sex, age),
#' values = income,
#' statistics = c("sum", "pct_group", "pct_total", "sum_wgt", "freq"),
#' formats = list(sex = sex., age = age.),
#' weight = weight,
#' nesting = "deepest",
#' na.rm = TRUE)
#'
#' all_possible <- my_data |>
#' summarise_plus(class = c(year, sex, age, income),
#' values = c(probability),
#' statistics = c("sum", "p1", "p99", "min", "max", "freq", "freq_g0"),
#' formats = list(sex = sex.,
#' age = age.,
#' income = income.),
#' weight = weight,
#' nesting = "all",
#' na.rm = TRUE)
#'
#'# Formats can also be passed as characters
#' single <- my_data |>
#' summarise_plus(class = c(year, age, sex),
#' values = weight,
#' statistics = c("sum", "mean"),
#' formats = list(sex = "sex.", age = "age."),
#' nesting = "single")
#'
#' merge_back <- my_data |>
#' summarise_plus(class = c(year, age, sex),
#' values = weight,
#' statistics = c("sum", "mean"),
#' nesting = "deepest",
#' merge_back = TRUE)
#'
#' certain_types <- my_data |>
#' summarise_plus(class = c(year, sex, age),
#' values = c(probability),
#' statistics = c("sum", "mean", "freq"),
#' formats = list(sex = sex.,
#' age = age.),
#' types = c("year", "year + age", "age + sex"),
#' weight = weight,
#' nesting = "all",
#' na.rm = TRUE)
#'
#' @export
summarise_plus <- function(data_frame,
class = NULL,
values,
statistics = c("sum", "freq"),
formats = c(),
types = c(),
weight = NULL,
nesting = "deepest",
merge_back = FALSE,
na.rm = FALSE,
monitor = FALSE,
notes = TRUE){
# Measure the time
start_time <- Sys.time()
monitor_df <- NULL |> monitor_start("Error handling", "Preparation")
# First convert data frame to data table
if (!data.table::is.data.table(data_frame)){
data_frame <- data.table::as.data.table(data_frame)
}
# Evaluate formats early, otherwise apply formats can't evaluate them in unit
# test situation.
formats_list <- as.list(substitute(formats))[-1]
if (length(formats_list) > 0){
formats <- stats::setNames(
lapply(formats_list, function(expression){
# Catch expression if passed as string
if (is.character(expression)) {
tryCatch(get(expression, envir = parent.frame()),
error = function(e) NULL)
}
# Catch expression if passed as symbol
else{
tryCatch(eval(expression, envir = parent.frame()),
error = function(e) NULL)
}
}),
names(formats_list))
}
# Look up variable names in format data frame to check whether there is an
# interval or discrete format
flag_interval <- FALSE
for (current_var in names(formats)){
format_df <- formats[[current_var]]
interval_variables <- c("from", "to")
actual_variables <- names(format_df)[1:2]
if (identical(interval_variables, actual_variables)){
flag_interval <- TRUE
break
}
}
###########################################################################
# Error handling
###########################################################################
# Correct nesting option if not set right
if (!nesting %in% c("deepest", "all", "single")){
message(" ! WARNING: Nested option '", nesting, "' doesn't exist. Options 'deepest', 'all' and 'single'\n",
" are available. Nested will be set to 'deepest'.")
nesting <- "deepest"
}
weight_temp <- sub("^list\\(", "c(", gsub("\"", "", deparse(substitute(weight), width.cutoff = 500L)))
# Create temporary weight column if none is provided.
# Also get the name of the weight variable as string.
if (weight_temp == "NULL" || substr(weight_temp, 1, 2) == "c("){
weight_var <- ".temp_weight"
data_frame[[".temp_weight"]] <- 1
if (substr(weight_temp, 1, 2) == "c("){
message(" ! WARNING: Only one variable for weight allowed. Evaluations will be unweighted.")
}
}
else if (!is_error(weight)){
# In this case weight already contains the substituted variable name
# while weight_temp is evaluated to the symbol passed into the function.
weight_var <- weight
}
else if (!weight_temp %in% names(data_frame)){
weight_var <- ".temp_weight"
data_frame[[".temp_weight"]] <- 1
message(" ! WARNING: Provided weight variable is not part of the data frame. Unweighted results will be computed.")
}
else if (!is_numeric(data_frame[[weight_temp]])){
weight_var <- ".temp_weight"
data_frame[[".temp_weight"]] <- 1
message(" ! WARNING: Provided weight variable is not numeric. Unweighted results will be computed.")
}
else{
weight_var <- weight_temp
# NA values in weight lead to errors therefor convert them to 0
if (anyNA(data_frame[[weight_temp]])){
message(" ~ NOTE: Missing values in weight variable '", weight_temp, "' will be converted to 0.")
}
data_frame[[weight_temp]] <- data.table::fifelse(is.na(data_frame[[weight_temp]]), 0, data_frame[[weight_temp]])
# @Hack: so I don't have to check if .temp_weight exists later on
data_frame[[".temp_weight"]] <- 1
}
# Setup necessary options per code if merge_back so that there are no errors
# even if the user didn't set them up correctly
if (merge_back){
if (nesting != "deepest" || length(formats) > 0){
message(" ! WARNING: Merging variables back only works with nesting = 'deepest' and only without formats.\n",
" Options will be set accordingly.")
}
nesting <- "deepest"
formats <- list()
}
list_of_statistics <- get_complete_statistics_list(statistics)
# Convert to character vectors
class_temp <- sub("^list\\(", "c(", gsub("\"", "", deparse(substitute(class), width.cutoff = 500L)))
if (substr(class_temp, 1, 2) == "c("){
class <- as.character(substitute(class))
}
else if (!is_error(class)){
# Do nothing. In this case class already contains the substituted variable names
# while class_temp is evaluated to the symbol passed into the function.
}
else{
class <- class_temp
}
# Remove extra first character created with substitution
class <- class[class != "c"]
# Convert to character vectors
values_temp <- sub("^list\\(", "c(", gsub("\"", "", deparse(substitute(values), width.cutoff = 500L)))
if (substr(values_temp, 1, 2) == "c("){
values <- as.character(substitute(values))
}
else if (!is_error(values)){
# Do nothing. In this case values already contains the substituted variable names
# while values_temp is evaluated to the symbol passed into the function.
}
else{
values <- values_temp
}
# Remove extra first character created with substitution
values <- values[values != "c"]
# If no value variables are provided abort
if (length(values) == 0){
message(" X ERROR: No values provided.")
return(NULL)
}
else if (length(values) == 1){
if (values == ""){
message(" X ERROR: No values provided.")
return(NULL)
}
}
# Make sure there is no class variable that is also a value variable.
invalid_class <- class[class %in% values]
values <- values[!values %in% class]
if (length(invalid_class) > 0){
message(" ! WARNING: The provided class variable '", paste(invalid_class, collapse = ", "), "' is also part of\n",
" the analysis variables. This variable will be omitted as analysis variable during computation.")
}
# Make sure that the variables provided are part of the data frame.
provided_class <- class
invalid_class <- class[!class %in% names(data_frame)]
class <- class[class %in% names(data_frame)]
if (length(invalid_class) > 0){
message(" ! WARNING: The provided class variable '", paste(invalid_class, collapse = ", "), "' is not part of\n",
" the data frame. This variable will be omitted during computation.")
}
# If no grouping variables are provided create a pseudo grouping variable
if (length(class) == 0){
class <- "pseudo_class"
data_frame[["pseudo_class"]] <- 1
}
provided_values <- values
invalid_values <- values[!values %in% names(data_frame)]
values <- values[values %in% names(data_frame)]
if (length(invalid_values) > 0){
message(" ! WARNING: The provided analysis variable '", paste(invalid_values, collapse = ", "), "' is not part of\n",
" the data frame. This variable will be omitted during computation.")
}
if (length(values) == 0){
message(" X ERROR: No valid analysis variables provided. Summarise will be aborted.")
return(NULL)
}
# Make sure provided variable list has no double entries
provided_class <- class
class <- unique(class)
if (length(provided_class) > length(class)){
message(" ! WARNING: Some grouping variables are provided more than once. The doubled entries will be omitted.")
}
provided_values <- values
values <- unique(values)
if (length(provided_values) > length(values)){
message(" ! WARNING: Some analysis variables are provided more than once. The doubled entries will be omitted.")
}
rm(class_temp, provided_class, invalid_class, values_temp, provided_values, invalid_values)
###########################################################################
# Summarisation starts
###########################################################################
monitor_df <- monitor_df |> monitor_next("Pre compute", "Preparation")
# If types are specified reorder them alphabetically
if (length(types) > 0){
reordered_types <- reorder_combination(types)
}
# Get the intersection of the requested statistics to make sure
# only valid actions are passed down
statistics <- tolower(statistics)
requested <- unique(unlist(list(statistics)))
valid_stats <- requested[requested %in% names(list_of_statistics)]
selected_stats <- list_of_statistics[valid_stats]
# Check if there are any statistics selected which aren't any kind of sums.
# If only sums are selected then it is faster to summarise first without formats
# and then apply the formats to a much smaller data frame.
only_sums <- valid_stats[!valid_stats %in% c("sum", "sum_wgt", "freq", "freq_g0",
"missing", "pct_group", "pct_total")]
# Get the group vars first
group_vars <- class
# Keep hold of original data frame to calculate correct missings for messages and
# for merge back option
original_df <- data_frame
result_df <- NULL
# Determine whether shortcut is possible
flag_shortcut <- FALSE
if (length(only_sums) == 0 && !flag_interval && !na.rm){
flag_shortcut <- TRUE
}
# Pre summarise data frame if only sums as statistics selected
if (flag_shortcut){
monitor_df <- monitor_df |> monitor_end()
# Summarise first
result_list <- data_frame |>
matrix_summarise(values,
group_vars,
data_frame[[weight_var]],
selected_stats,
list_of_statistics,
monitor_df)
data_frame <- result_list[[1]]
monitor_df <- result_list[[2]]
rm(result_list)
# Catch new variable names
values <- data_frame |> inverse(group_vars)
}
monitor_df <- monitor_df |> monitor_next("Apply formats", "Apply formats")
# If only the combination of every grouping variable should be evaluated
if (tolower(nesting) == "deepest"){
# Remove NAs from grouping variables
if (na.rm){
data_frame <- data_frame[stats::complete.cases(data_frame[class]), ]
}
message("\n > Executing nested merge.")
get_group_missings(original_df[group_vars], notes, na.rm)
if (!merge_back){
rm(original_df)
}
if (flag_shortcut){
# Convert numeric variables back which have become characters during summarisation
# and apply formats
result_df <- data_frame |>
convert_numeric(group_vars) |>
apply_format(formats, group_vars)
# Final summarise with formatted data frame
result_df <- result_df |>
collapse::fgroup_by(group_vars) |>
collapse::fsummarise(across(values, collapse::fsum))
monitor_df <- monitor_df |> monitor_end()
result_list <- list(result_df, monitor_df)
fast_pct <- TRUE
}
else{
# Apply formats first
result_df <- data_frame |>
apply_format(formats, group_vars)
monitor_df <- monitor_df |> monitor_end()
# If there are multiple evaluation types given, every variable gets an
# extension to it's variable name
# If there is only one evaluation type given, the original variable names
# are kept without extension
result_list <- result_df |>
matrix_summarise(values,
group_vars,
result_df[[weight_var]],
selected_stats,
list_of_statistics,
monitor_df)
fast_pct <- FALSE
}
# Compute percentages
result_list <- compute_group_percentages(data_frame,
result_list[[1]],
statistics,
group_vars,
formats,
values,
weight_var,
list_of_statistics,
result_list[[2]],
fast_pct)
result_list <- compute_total_percentages(data_frame,
result_list[[1]],
statistics,
group_vars,
values,
weight_var,
list_of_statistics,
result_list[[2]],
fast_pct)
# Split results and monitor
result_df <- result_list[[1]]
monitor_df <- result_list[[2]]
monitor_df <- monitor_df |> monitor_start("Clean up", "Clean up")
# Generate TYPE variables
type <- paste(group_vars, collapse = "+")
result_df[["TYPE"]] <- type
result_df[["TYPE_NR"]] <- 1
result_df[["DEPTH"]] <- length(group_vars)
# Reorder new variables in order of requested statistics
result_df <- result_df |>
handle_sum_drops(statistics) |>
reorder_summarised_columns(requested)
# If no formats are used (meaning only pre defined variables) it is
# possible to merge the summarized variables back to the original
# data frame in one go.
if (merge_back){
message("\n > Merging back.")
monitor_df <- monitor_df |> monitor_next("Merge back", "Merge back")
# Don't merge back type variables, only summarised variable
result_df <- result_df |> drop_type_vars()
result_df <- original_df |>
collapse::fungroup() |>
collapse::join(result_df,
on = group_vars,
how = "left",
verbose = FALSE) |>
dropp(".temp_weight")
}
monitor_df <- monitor_df |> monitor_end()
monitor_plot(monitor_df, by = "section", draw_plot = monitor)
}
# If every possible combination of the given grouping variables should be evaluated
else if (tolower(nesting) %in% c("all", "single")){
all_results <- list()
index <- 1
message("\n > Executing merge:\n",
" + total")
# The results of all the possible combinations are computed one after another
# starting with the grand total (ungrouped)
if (flag_shortcut){
# Final summarise
total_df <- data_frame |>
convert_numeric(group_vars) |>
collapse::fsummarise(across(values, collapse::fsum))
sum_columns <- values[grepl("_sum$", values)]
new_group_pct <- paste0(gsub("_sum$", "", sum_columns), "_pct_group")
new_total_pct <- paste0(gsub("_sum$", "", sum_columns), "_pct_total")
monitor_df <- monitor_df |> monitor_end()
total_list <- list(total_df, monitor_df)
}
else{
monitor_df <- monitor_df |> monitor_end()
total_list <- data_frame |>
collapse::fungroup() |>
dropp(class) |>
matrix_summarise(values,
NULL,
data_frame[[weight_var]],
selected_stats,
list_of_statistics,
monitor_df)
sum_columns <- paste0(values, "_sum")
new_group_pct <- paste0(values, "_pct_group")
new_total_pct <- paste0(values, "_pct_total")
}
# Split results and monitor
total_df <- total_list[[1]]
monitor_df <- total_list[[2]]
monitor_df <- monitor_df |> monitor_start("pct_group(total)", "Calc(total)")
# Copy total sums to have them ready for the group data frames. This way
# they can be easily joined to compute total percentages faster.
new_columns <- paste0(sum_columns, "_qol")
total_df_copy <- data.table::copy(total_df) |>
keep(sum_columns) |>
collapse::frename(stats::setNames(new_columns, sum_columns))
# Compute percentages
if ("pct_group" %in% statistics){
# For compute percentages for every variable
for (i in seq_along(new_group_pct)) {
current_new_var <- new_group_pct[i]
total_df[[current_new_var]] <- 100
}
}
monitor_df <- monitor_df |> monitor_next("pct_total(total)", "Calc(total)")
if ("pct_total" %in% statistics){
# For compute percentages for every variable
for (i in seq_along(new_total_pct)) {
current_new_var <- new_total_pct[i]
total_df[[current_new_var]] <- 100
}
}
# Every grouping variable which was not part of the current grouping
# gets set to a missing value
total_df[group_vars] <- NA
total_df[["TYPE"]] <- "total"
total_df[["TYPE_NR"]] <- as.integer(index)
total_df[["DEPTH"]] <- as.integer(0)
# Add data frame to list to add them together at the end
all_results[["total"]] <- total_df
monitor_df <- monitor_df |> monitor_end()
# If grouping variables where defined: compute every possible combination
# of these variables. Starting with single combinations, then double,
# triple, and so on.
if (length(group_vars) > 0) {
index <- index + 1
message("\n > Executing combination merge:")
for (i in seq_along(group_vars)){
combinations <- utils::combn(group_vars, i, simplify = FALSE)
# If only combinations of depth 0 and 1 should be generated break out of the loop
if (tolower(nesting) == "single" && i > 1){
break
}
for (combination in combinations){
# If types are specified check first if combination is of the right type
if (length(types) > 0){
monitor_df <- monitor_df |>
monitor_start(paste0("Drop Combo(", paste(combination, collapse = " + "), ")"),
"Drop Combo")
# Order current combination alphabetically
reordered_combo <- paste(sort(combination), collapse = "+")
# Check if current combination is part of the specified types.
# If not jump to next combination.
if (!reordered_combo %in% reordered_types){
next
}
monitor_df <- monitor_df |> monitor_end()
}
monitor_df <- monitor_df |>
monitor_start(paste0("Formats(", paste(combination, collapse = " + "), ")"),
paste0("Formats(", paste(combination, collapse = " + "), ")"))
# Remove NAs from grouping variables
if (na.rm){
data_frame <- original_df[stats::complete.cases(original_df[combination]), ]
}
message(" + ", paste(combination, collapse = " + "))
get_group_missings(original_df[combination], notes, na.rm)
if (flag_shortcut){
# Convert numeric variables back which have become characters during summarisation
# and apply formats
group_df <- data_frame |>
convert_numeric(combination) |>
apply_format(formats, combination)
# Final summarise with formatted data frame
group_df <- group_df |>
collapse::fgroup_by(combination) |>
collapse::fsummarise(across(values, collapse::fsum))
monitor_df <- monitor_df |> monitor_end()
group_list <- list(group_df, monitor_df)
fast_pct <- TRUE
}
else{
# Apply formats first
group_df <- data_frame |>
keep(combination, values, weight_var) |>
apply_format(formats, combination)
monitor_df <- monitor_df |> monitor_end()
# If there are multiple evaluation types given, every variable gets an
# extension to it's variable name
# If there is only one evaluation type given, the original variable names
# are kept without extension
group_list <- group_df |>
matrix_summarise(values,
combination,
group_df[[weight_var]],
selected_stats,
list_of_statistics,
monitor_df)
fast_pct <- FALSE
}
# Compute percentages
if (length(types) == 0){
if (length(combination) == 1){
# If there is only one grouping variable the super group will be
# NULL and therefore can't be grouped. The super total therefore
# is simply the grand total.
group_list <- compute_total_percentages_short(group_list[[1]],
total_df_copy,
statistics,
sum_columns,
combination,
"pct_group",
group_list[[2]])
}
else{
# Evaluate the group percentages based on the super group.
# Since the depending super group is already computed
# in one of the previous iterations we can make use of
# that and simply join the super_df accordingly.
super_group <- paste(combination[-length(combination)], collapse = "+")
group_list <- compute_group_percentages_short(group_list[[1]],
all_results[[super_group]],
statistics,
combination,
sum_columns,
group_list[[2]],
fast_pct)
}
}
# When types are specified
else{
group_list <- compute_group_percentages(data_frame,
group_list[[1]],
statistics,
combination,
formats,
values,
weight_var,
list_of_statistics,
group_list[[2]],
fast_pct)
}
group_list <- compute_total_percentages_short(group_list[[1]],
total_df_copy,
statistics,
sum_columns,
combination,
"pct_total",
group_list[[2]])
# Split results and monitor
group_df <- group_list[[1]]
monitor_df <- group_list[[2]]
monitor_df <- monitor_df |>
monitor_start(paste0("Finish(", paste(combination, collapse = " + "), ")"),
paste0("Finish(", paste(combination, collapse = " + "), ")"))
# Every grouping variable which was not part of the current grouping
# gets set to a missing value
missing_vars <- setdiff(group_vars, combination)
group_df[missing_vars] <- NA
type <- paste(combination, collapse = "+")
group_df[["TYPE"]] <- type
group_df[["TYPE_NR"]] <- as.integer(index)
group_df[["DEPTH"]] <- as.integer(i)
# Add data frame to list to add them together at the end
all_results[[type]] <- group_df
index <- index + 1
monitor_df <- monitor_df |> monitor_end()
}
}
}
message("\n > Putting results together.")
monitor_df <- monitor_df |> monitor_start("Clean up", "Clean up")
# Put all computed data frames one below the other and sort the variables
# in the order: groups -> types -> results
result_df <- data.table::rbindlist(all_results, fill = TRUE)
# Reorder new variables in order of requested statistics
result_df <- result_df |>
data.table::setcolorder(c(group_vars, "TYPE", "TYPE_NR", "DEPTH")) |>
handle_sum_drops(statistics) |>
reorder_summarised_columns(requested)
# Automatically fuse variables of depth 0 and 1 if only single nested combinations
# were generated
if (tolower(nesting) == "single"){
result_df <- result_df |>
fuse_variables("fused_vars", group_vars)
}
# If types are defined, remove total if not defined in types
if (length(types) > 0 && !"total" %in% types){
result_df <- result_df |> collapse::fsubset(TYPE != "total")
}
monitor_df <- monitor_df |> monitor_end()
monitor_df |> monitor_plot(by = "group", draw_plot = monitor)
}
# Drop pseudo group variable if there is one
if (any(class == "pseudo_class")){
result_df <- result_df |> dropp(class)
}
end_time <- round(difftime(Sys.time(), start_time, units = "secs"), 3)
message("\n- - - 'summarise_plus' execution time: ", end_time, " seconds\n")
result_df
}
###############################################################################
# List of all possible actions
###############################################################################
static_statistics <- list(sum = collapse::fsum,
freq = function(x, w, g) collapse::fnobs(x, g = g),
freq_g0 = function(x, w, g) freq_g0_qol(x, group = g),
mean = collapse::fmean,
median = collapse::fmedian,
mode = collapse::fmode,
min = function(x, w, g) collapse::fmin(x, g = g),
max = function(x, w, g) collapse::fmax(x, g = g),
sd = collapse::fsd,
variance = collapse::fvar,
first = function(x, w, g) collapse::ffirst(x, g = g),
last = function(x, w, g) collapse::flast(x, g = g),
missing = function(x, w, g) collapse::fsum(is.na(x), g = g))
#' Compute Percentile Functions
#'
#' @description
#' Computes percentile functions and adds them to the list of statistical functions.
#'
#' @param statistics User provided statistics.
#'
#' @return
#' Returns a complete list of named statistical functions.
#'
#' @noRd
get_complete_statistics_list <- function(statistics){
all_stats <- static_statistics
for (stat_name in statistics) {
# Match pattern p<number>
if (grepl("^p\\d+$", stat_name)) {
prob <- as.numeric(sub("^p", "", stat_name)) / 100
if (prob > 1){
message(" ! WARNING: Percentiles are only possible from p0 to p100. ", stat_name, " will be omited.")
next
}
# Define the function
all_stats[[stat_name]] <- (function(prob) {
force(prob)
function(x, w = NULL, g = NULL) {
percentiles_qol(x, w, g, probs = prob)
}
})(prob)
}
}
all_stats
}
###############################################################################
# Turns data frame into matrix for fast computation. Evaluates statistics and
# turns everything back into a data frame.
###############################################################################
#' Core Summarise
#'
#' @description
#' This is the core summarisation process. The input data frame is split up in a
#' matrix of values and the grouping variables to make summarisation faster. At
#' the end everything is put back together as a data table.
#'
#' @param data_frame The data frame to be summarised.
#' @param values A vector containing all variables that should be summarised.
#' @param group_vars A vector containing all grouping variables.
#' @param weight Weight variable for weighted results.
#' @param statistics User specified statistics.
#' @param list_of_statistics A list containing all possible statistic functions.
#' @param monitor_df Data frame which stores the monitoring values.
#'
#' @return
#' Returns a list with the summarised data table and the monitoring data frame.
#'
#' @noRd
matrix_summarise <- function(data_frame,
values,
group_vars,
weight,
statistics,
list_of_statistics,
monitor_df){
monitor_df <- monitor_df |> monitor_start("Matrix conversion", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))
# Temporarily rename "." in factor variable levels, if there are any. This is
# necessary because later the matrix row names need to be split by ".". If there
# are any additional dots in the level names, this leads to errors.
for (column in names(data_frame[, group_vars])){
if (is.factor(data_frame[[column]])){
levels(data_frame[[column]]) <- gsub("\\.", "!!!", levels(data_frame[[column]]))
}
}
# Convert the value columns of the data frame into a matrix
value_matrix <- as.matrix(data_frame[, values, with = FALSE])
# Create group
if (is.null(group_vars)){
# In case there is no grouping (e.g. total percentages) create pseudo group
group_vars <- "pseudo_group"
data_frame[[group_vars]] <- 1
grouping <- collapse::GRP(data_frame, group_vars)
}
else{
grouping <- collapse::GRP(data_frame, group_vars)
}
monitor_df <- monitor_df |> monitor_end()
# Compute statistics and put results into a list
result_list <- lapply(names(statistics), function(single_stat){
# Skip sum and do it separately to keep group text information.
if (single_stat != "sum"){
monitor_df <- monitor_df |> monitor_start(single_stat, paste0("Calc(", paste(group_vars, collapse = " + "), ")"))
# Get functions one by one from the global list
stat_function <- list_of_statistics[[single_stat]]
# Do computation as matrix and put results back into a data table
stat_result <- data.table::as.data.table(
stat_function(value_matrix, w = weight, g = grouping))
# Put stat name at the end of variable name
if (nrow(stat_result) > 0){
data.table::setnames(stat_result, paste0(values, "_", single_stat))
}
monitor_df <- monitor_df |> monitor_end()
list(stat_result, monitor_df[nrow(monitor_df), ])
}
})
# Separate results from monitoring
monitor_list <- lapply(result_list, `[[`, 2)
monitor_df <- rbind(monitor_df,
do.call(rbind, monitor_list))
monitor_df <- monitor_df |> monitor_start("sum", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))
result_list <- lapply(result_list, `[[`, 1)
# Do sum separately as in the loop stated
stat_function <- list_of_statistics[["sum"]]
sum_result <- stat_function(value_matrix, w = weight, g = grouping)
# Restore grouping variables from combined matrix row names (format a.b.c.d)
# by splitting them up and transposing them back to columns. Also restore
# their actual variable names.
# This has to be done only once because every value and statistic uses the
# same grouping.
restored_group <- data.table::as.data.table(
data.table::tstrsplit(rownames(sum_result), split = ".", fixed = TRUE))
# Restore temporarily renamed dots
for (column in names(restored_group)){
if (is.character(restored_group[[column]])){
restored_group[[column]] <- gsub("\\.", "!!!", restored_group[[column]])
}
}
if (length(names(restored_group)) > length(group_vars)){
message(" X ERROR: One of the grouping variables is not suited for grouping.")
restored_group <- restored_group[-2]
}
data.table::setnames(restored_group, group_vars)
restored_group[restored_group == "NA"] <- NA
# Convert sum matrix to data.table
sum_result <- data.table::as.data.table(sum_result)
data.table::setnames(sum_result, paste0(values, "_sum"))
# Do weight of sums separately because this only needs to be computed once
# and not per value variable.
monitor_df <- monitor_df |> monitor_next("sum_wgt", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))
sum_wgt <- data.table::as.data.table(
sum_wgt_qol(values = weight, group = grouping))
data.table::setnames(sum_wgt, "sum_wgt")
# Combine grouping variables with results to a full data table
monitor_df <- monitor_df |> monitor_next("Combine evaluations", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))
if (!"pseudo_group" %in% names(restored_group)){
# Normal case with grouping variables
result_df <- cbind(restored_group,
sum_wgt, sum_result,
do.call(cbind, result_list))
}
else{
# Case where there are no grouping variables
result_df <- cbind(sum_wgt, sum_result,
do.call(cbind, result_list))
}
monitor_df <- monitor_df |> monitor_end()
list(result_df, monitor_df)
}
###############################################################################
# Handle what happens with automatically generated sum variables
###############################################################################
#' Drop Sum Stats
#'
#' @description
#' Drops sum and sum_wgt variables if not specified by the user.
#'
#' @param result_df The result data frame which contains the sum variables.
#' @param statistics The user provided statistics to check whether sum stats are
#' provided.
#'
#' @return
#' Returns an adjusted result data frame.
#'
#' @noRd
handle_sum_drops <- function(result_df, statistics){
# Handle sum and sum of weights
# @Speed: Probably not ideal to always generate sum and sum of weights
# and delete it conditionally, but it can be handled with less code this way.
# Also sum so so fast even with big data that it shouldn't make much difference.
if (!"sum_wgt" %in% statistics){
result_df <- result_df |> dropp("sum_wgt")
}
sum_columns <- grep("_sum$", names(result_df), value = TRUE)
if (!"sum" %in% statistics){
result_df <- result_df |> dropp(sum_columns)
}
result_df
}
###############################################################################
# After summarise the new variables are ordered like the list_of_statistics.
# This function reorders them in the order which the user has entered on
# summarise_plus function call.
###############################################################################
#' Order Columns by Stats
#'
#' @description
#' Order the value columns of the final result data frame by stats as provided
#' by the user.
#'
#' @param data_frame The result data frame which contains all variables.
#' @param requested_stats The user provided statistics to get the order from.
#'
#' @return
#' Returns a reordered data frame.
#'
#' @noRd
reorder_summarised_columns <- function(data_frame, requested_stats){
# Select all summarised variables in blocks of the requested statistics
ordered_cols <- unlist(lapply(requested_stats, function(single_stat){
# Concatenates the variable ending like "_sum", "_mean", etc. and selects
# all variables within the given data frame and returns them
grep(paste0(single_stat, "$"),
names(data_frame),
value = TRUE)
}))
ordered_cols <- unique(ordered_cols)
# Put the ordered columns at the end of the data frame
data_frame |> data.table::setcolorder(ordered_cols, after = ncol(data_frame))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.