R/statistics.R
In qol: Powerful 'SAS' Inspired Concepts for more Efficient Bigger Outputs

#' Sum of Weights
#'
#' @description
#' If specifying this stat with a provided weight variable, the sum of all weights
#' is computed.
#'
#' @param values The weights to sum up.
#' @param group Grouping variables.
#'
#' @return
#' Returns the sum of weights.
#'
#' @noRd
sum_wgt_qol <- function(values, group){
    # Sum up only the weight values
    collapse::fsum(x = values, g = group)
}


#' Unweighted Frequencies of Values Greater Than Zero
#'
#' @description
#' This stat always computes unweighted sums, completely ignoring provided weights.
#' It also just counts the values which are greater than 0.
#'
#' @param values The values to sum up.
#' @param group Grouping variables.
#'
#' @return
#' Returns uweighted frequencies of values greater than zero.
#'
#' @noRd
freq_g0_qol <- function(values, group){
    # Create a vector of same size as input vector and make every value a 1
    # that is greater than 0.
    # Sum up this new vector to get unweighted frequencies.
    values[values > 0] <- 1

    collapse::fsum(values, g = group)
}


#' Calculate Any Percentile
#'
#' @description
#' Calculates any percentile by grouping variables.
#'
#' @param values The values for which to compute percentiles.
#' @param weight A weighting variable.
#' @param group Grouping variables.
#' @param probs The percentiles that should be computed.
#'
#' @return
#' Returns weighted percentiles.
#'
#' @noRd
percentiles_qol <- function(values, weight, group, probs){
    if (anyNA(values)){
        message(" ! WARNING: To calculate percentiles there may be no NAs in the value variables.")
        return(NULL)
    }

    collapse::BY(values, g = group, collapse::fquantile, w = weight, probs = probs)
}


#' Get Missings of Combined Grouping Variables
#'
#' @description
#' Calculate the number of missings, which is generated by the combination of all
#' grouping variables.
#'
#' @param group_vars Grouping variables.
#' @param notes Flag if missings should be computed and message should be printed.
#' @param na.rm Flag if NA values are removed or not.
#'
#' @return
#' Returns a message stating the number of missing values based on grouping variables.
#'
#' @noRd
get_group_missings <- function(group_vars, notes, na.rm){
    if (na.rm){
        return(FALSE)
    }

    if (!notes){
        return(FALSE)
    }

    # Count combined missings of group variables. To count as missings only
    # one of the variables has to be NA in a position.
    missings <- collapse::fsum(!stats::complete.cases(group_vars))

    # If there are missings calculate percentage and output a message
    if (missings > 0){
        nobs      <- length(group_vars[[1]])
        percent   <- round(missings * 100 / nobs, 1)
        none_miss <- nobs - missings

        message(" ~ NOTE: ", format(missings, big.mark = ".", decimal.mark = ","),
                " missings generated from grouping variables (", percent,
                " %). Number of observations: ", format(none_miss, big.mark = ".", decimal.mark = ","), "/", format(nobs, big.mark = ".", decimal.mark = ","))
    }
}


#' Compute Percentages by Group
#'
#' @description
#' Calculate the percentages inside a specific group of variables by generating the
#' super group first and joining the super group totals back to the data frame.
#'
#' @param original_df The original unchanged data frame provided by the user.
#' @param summary_df Already summed up result data frame to build up upon.
#' @param statistics User provided statistics.
#' @param group_vars Grouping variables.
#' @param formats Provided list of variables with formats to apply.
#' @param values he values for which to compute percentages.
#' @param weight Weighting variable.
#' @param list_of_statistics A list of all statistics that can be computed.
#' @param monitor_df Data frame which stores the monitoring values.
#' @param fast_pct Flag that states whether the function can take a shortcut for faster
#' calculation.
#'
#' @return
#' Returns a data frame with added percentages, if the statistic was specified.
#'
#' @noRd
compute_group_percentages <- function(original_df,
                                      summary_df,
                                      statistics,
                                      group_vars,
                                      formats,
                                      values,
                                      weight,
                                      list_of_statistics,
                                      monitor_df,
                                      fast_pct){
    if (!"pct_group" %in% statistics){
        return(list(summary_df, monitor_df))
    }

    monitor_df <- monitor_df |> monitor_start("pct_group", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))

    # Create pseudo group variable for totals
    original_df[[".temp_key"]] <- 1
    summary_df[[".temp_key"]]  <- 1

    # Get rid of the last grouping variable to get the grouping above the
    # desired grouping
    if (length(group_vars) > 1){
        super_group <- group_vars[-length(group_vars)]
    }
    else{
        super_group <- ".temp_key"
    }

    cut_off_var <- group_vars[length(group_vars)]

    # If no fast percentage (means original_df is the full data frame)
    if (!fast_pct){
        # It is faster to summarise first without formats
        # and then apply the formats to a much smaller data frame.
        result_list <- original_df |>
            matrix_summarise(values,
                             super_group,
                             original_df[[weight]],
                             NULL,
                             list_of_statistics,
                             monitor_df)

        super_df <- result_list[[1]]
        rm(result_list)

        # Convert numeric variables back which have become characters during summarisation
        # and apply formats
        super_df <- super_df |>
            convert_numeric(super_group) |>
            apply_format(formats, super_group) |>
            dropp("sum_wgt")

        # Catch new variable names
        values <- super_df |> inverse(group_vars)
    }
    # If fast percentage (means original_df is already summarised)
    else{
        values <- values[grepl("_sum$", values)]

        # Summarise first
        super_df <- original_df |>
            collapse::fgroup_by(super_group) |>
            collapse::fsummarise(across(values, collapse::fsum)) |>
            convert_numeric(super_group)

        super_df <- super_df |>
            apply_format(formats, super_group)
    }

    # Generate new variable names for super_group totals
    new_names <- paste0(values, "_qol")
    old_names <- values

    # Final summarise with formatted data frame
    super_df <- super_df |>
        collapse::fgroup_by(super_group) |>
        collapse::fsummarise(across(values, collapse::fsum)) |>
        collapse::frename(stats::setNames(old_names, new_names)) |>
        keep(super_group, new_names)

    # Join super_df to summarized data frame
    if (is.null(super_group)){
        # In case of only totals
        joined_df <- merge(summary_df, super_df,
                           by = ".temp_key",
                           allow.cartesian = TRUE)
    }
    else{
        # In case there is at least one grouping variable
        joined_df <- collapse::join(summary_df, super_df,
                                    on = super_group,
                                    how = "left",
                                    verbose = FALSE)
    }

    # Return finished data frame
    joined_df <- joined_df |>
        calculate_percentages(values, "pct_group", cut_off_var) |>
        dropp(".temp_key")

    monitor_df <- monitor_df |> monitor_end()

    list(joined_df, monitor_df)
}


#' Compute Percentages by Group
#'
#' @description
#' Calculate the percentages inside a specific group of variables, which builds upon
#' a data frame which already carries the summarised super groups.
#'
#' @param summary_df Already summed up result data frame to build up upon.
#' @param super_df Already summed up super group results.
#' @param statistics User provided statistics.
#' @param group_vars Grouping variables.
#' @param values he values for which to compute percentages.
#' @param monitor_df Data frame which stores the monitoring values.
#' @param fast_pct Flag that states whether the function can take a shortcut for faster
#' calculation.
#'
#' @return
#' Returns a data frame with added percentages, if the statistic was specified.
#'
#' @noRd
compute_group_percentages_short <- function(summary_df,
                                            super_df,
                                            statistics,
                                            group_vars,
                                            values,
                                            monitor_df,
                                            fast_pct){
    if (!"pct_group" %in% statistics){
        return(list(summary_df, monitor_df))
    }

    monitor_df <- monitor_df |> monitor_start("pct_group", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))

    # Get rid of the last grouping variable to get the grouping above the
    # desired grouping
    super_group <- group_vars[-length(group_vars)]

    # Generate new variable names for super_group totals
    old_names <- values
    new_names <- paste0(values, "_qol")

    super_df <- super_df |>
        keep(super_group, old_names) |>
        collapse::frename(stats::setNames(old_names, new_names))

    # Join data frames by super group
    joined_df <- collapse::join(summary_df, super_df,
                                on = super_group,
                                how = "left",
                                verbose = FALSE)

    # Return finished data frame
    joined_df <- joined_df |>
        calculate_percentages(values, "pct_group", group_vars[length(group_vars)])

    monitor_df <- monitor_df |> monitor_end()

    list(joined_df, monitor_df)
}


#' Compute Total Percentages
#'
#' @description
#' Calculate the percentages based on the overall totals.
#'
#' @param original_df The original unchanged data frame provided by the user.
#' @param summary_df Already summed up result data frame to build up upon.
#' @param statistics User provided statistics.
#' @param group_vars Grouping variables.
#' @param values he values for which to compute percentages.
#' @param weight Weighting variable.
#' @param list_of_statistics A list of all statistics that can be computed.
#' @param monitor_df Data frame which stores the monitoring values.
#' @param fast_pct Flag that states whether the function can take a shortcut for faster
#' calculation.
#'
#' @return
#' Returns a data frame with added percentages, if the statistic was specified.
#'
#' @noRd
compute_total_percentages <- function(original_df,
                                      summary_df,
                                      statistics,
                                      group_vars,
                                      values,
                                      weight,
                                      list_of_statistics,
                                      monitor_df,
                                      fast_pct){
    if (!"pct_total" %in% statistics){
        return(list(summary_df, monitor_df))
    }

    monitor_df <- monitor_df |> monitor_start("pct_total", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))

    # Generate new variable names for super_group totals
    new_names <- paste0(values, "_qol")

    # Compute the data frame with summaries of the super groups
    super_df <- original_df |>
        collapse::fungroup()

    # If no fast percentage (means original_df is the full data frame)
    if (!fast_pct){
        result_list <- super_df |>
            matrix_summarise(values,
                             NULL,
                             super_df[[weight]],
                             NULL,
                             list_of_statistics,
                             monitor_df)

        # Split results and monitor
        super_df <- result_list[[1]]

        # Catch new variable names
        values <- super_df |>
            dropp("sum_wgt") |>
            inverse(group_vars)
    }
    # If fast percentage (means original_df is already summarised)
    else{
        values <- values[grepl("_sum$", values)]

        # Simple Summarise
        super_df <- original_df |>
            collapse::fsummarise(across(values, collapse::fsum))
    }

    # Generate new variable names for super_group totals
    new_names <- paste0(values, "_qol")
    old_names <- values

    super_df <- super_df |>
        collapse::frename(stats::setNames(old_names, new_names)) |>
        keep(new_names)

    # Join super_df to summarized data frame
    summary_df[[".temp_key"]] <- 1
    super_df[[".temp_key"]]   <- 1

    joined_df <- merge(summary_df, super_df,
                       by = ".temp_key",
                       allow.cartesian = TRUE)

    # Return finished data frame
    joined_df <- joined_df |>
        calculate_percentages(values, "pct_total", ".temp_key") |>
        dropp(".temp_key")

    monitor_df <- monitor_df |> monitor_end()

    list(joined_df, monitor_df)

}


#' Compute Total Percentages
#'
#' @description
#' Calculate the percentages based on the overall totals, which builds upon
#' a data frame which already carries the summarised overall totals.
#'
#' @param summary_df Already summed up result data frame to build up upon.
#' @param total_df Already summed up result data frame to build up upon.
#' @param statistics User provided statistics.
#' @param values he values for which to compute percentages.
#' @param last_group_var The last variable of grouping variables.
#' @param pct_name Name of percentages.
#' @param monitor_df Data frame which stores the monitoring values.
#'
#' @return
#' Returns a data frame with added percentages, if the statistic was specified.
#'
#' @noRd
compute_total_percentages_short <- function(summary_df,
                                            total_df,
                                            statistics,
                                            values,
                                            last_group_var,
                                            pct_name = "pct_total",
                                            monitor_df){
    if (!pct_name %in% statistics){
        return(list(summary_df, monitor_df))
    }

    monitor_df <- monitor_df |> monitor_start(pct_name, paste0("Calc(", paste(last_group_var, collapse = " + "), ")"))

    # Join super_df to summarized data frame
    summary_df[[".temp_key"]] <- 1
    total_df[[".temp_key"]]   <- 1

    joined_df <- merge(summary_df, total_df,
                       by = ".temp_key",
                       allow.cartesian = TRUE)

    # Return finished data frame
    if (pct_name == "pct_total"){
        joined_df <- joined_df |>
            calculate_percentages(values, pct_name, ".temp_key") |>
            dropp(".temp_key")
    }
    else{
        # For group percentages with depth 1
        joined_df <- joined_df |>
            calculate_percentages(values, pct_name, last_group_var) |>
            dropp(".temp_key")
    }

    monitor_df <- monitor_df |> monitor_end()

    list(joined_df, monitor_df)

}


#' Compute Percentages
#'
#' @description
#' The main process of calculating percentages.
#'
#' @param joined_df Data frame with joined super group or overall totals.
#' @param values he values for which to compute percentages.
#' @param last_group_var The last variable of grouping variables.
#' @param pct_name Name of percentages.
#'
#' @return
#' Returns a data frame with added percentages.
#'
#' @noRd
calculate_percentages <- function(joined_df, values, pct_name, last_group_var){
    # Specify numerators and denominators
    numerator <- values

    if (!all(values %in% names(joined_df))){
        # In case variable names have the statistic extension at the end
        numerator <- paste0(values, "_sum")
    }

    denominator <- paste0(values, "_qol")
    new_values <- paste0(gsub("_sum$", "", values), "_", pct_name)

    # Compute percentages for every variable
    for (i in seq_along(numerator)) {
        current_num <- numerator[i]
        current_den <- denominator[i]
        current_new_var <- new_values[i]

        # Evaluate percentage and set division by 0 to NA
        result <- joined_df[[current_num]] * 100 / joined_df[[current_den]]
        result[is.nan(result)] <- NA

        joined_df[[current_new_var]] <- result
    }

    joined_df |>
        dropp(denominator)
}
Any scripts or data that you put into this service are public.
qol documentation built on Dec. 14, 2025, 1:06 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
qol
Powerful 'SAS' Inspired Concepts for more Efficient Bigger Outputs

R/statistics.R
In qol: Powerful 'SAS' Inspired Concepts for more Efficient Bigger Outputs

Defines functions calculate_percentages compute_total_percentages_short compute_total_percentages compute_group_percentages_short compute_group_percentages get_group_missings percentiles_qol freq_g0_qol sum_wgt_qol

Try the qol package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

qol Powerful 'SAS' Inspired Concepts for more Efficient Bigger Outputs

R/statistics.R In qol: Powerful 'SAS' Inspired Concepts for more Efficient Bigger Outputs

Defines functions calculate_percentages compute_total_percentages_short compute_total_percentages compute_group_percentages_short compute_group_percentages get_group_missings percentiles_qol freq_g0_qol sum_wgt_qol

Try the qol package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

qol
Powerful 'SAS' Inspired Concepts for more Efficient Bigger Outputs

R/statistics.R
In qol: Powerful 'SAS' Inspired Concepts for more Efficient Bigger Outputs