R/statistics.R

Defines functions calculate_percentages compute_total_percentages_short compute_total_percentages compute_group_percentages_short compute_group_percentages get_group_missings percentiles_qol freq_g0_qol sum_wgt_qol

#' Sum of Weights
#'
#' @description
#' If specifying this stat with a provided weight variable, the sum of all weights
#' is computed.
#'
#' @param values The weights to sum up.
#' @param group Grouping variables.
#'
#' @return
#' Returns the sum of weights.
#'
#' @noRd
sum_wgt_qol <- function(values, group){
    # Sum up only the weight values
    collapse::fsum(x = values, g = group)
}


#' Unweighted Frequencies of Values Greater Than Zero
#'
#' @description
#' This stat always computes unweighted sums, completely ignoring provided weights.
#' It also just counts the values which are greater than 0.
#'
#' @param values The values to sum up.
#' @param group Grouping variables.
#'
#' @return
#' Returns uweighted frequencies of values greater than zero.
#'
#' @noRd
freq_g0_qol <- function(values, group){
    # Create a vector of same size as input vector and make every value a 1
    # that is greater than 0.
    # Sum up this new vector to get unweighted frequencies.
    values[values > 0] <- 1

    collapse::fsum(values, g = group)
}


#' Calculate Any Percentile
#'
#' @description
#' Calculates any percentile by grouping variables.
#'
#' @param values The values for which to compute percentiles.
#' @param weight A weighting variable.
#' @param group Grouping variables.
#' @param probs The percentiles that should be computed.
#'
#' @return
#' Returns weighted percentiles.
#'
#' @noRd
percentiles_qol <- function(values, weight, group, probs){
    if (anyNA(values)){
        message(" ! WARNING: To calculate percentiles there may be no NAs in the value variables.")
        return(NULL)
    }

    collapse::BY(values, g = group, collapse::fquantile, w = weight, probs = probs)
}


#' Get Missings of Combined Grouping Variables
#'
#' @description
#' Calculate the number of missings, which is generated by the combination of all
#' grouping variables.
#'
#' @param group_vars Grouping variables.
#' @param notes Flag if missings should be computed and message should be printed.
#' @param na.rm Flag if NA values are removed or not.
#'
#' @return
#' Returns a message stating the number of missing values based on grouping variables.
#'
#' @noRd
get_group_missings <- function(group_vars, notes, na.rm){
    if (na.rm){
        return(FALSE)
    }

    if (!notes){
        return(FALSE)
    }

    # Count combined missings of group variables. To count as missings only
    # one of the variables has to be NA in a position.
    missings <- collapse::fsum(!stats::complete.cases(group_vars))

    # If there are missings calculate percentage and output a message
    if (missings > 0){
        nobs      <- length(group_vars[[1]])
        percent   <- round(missings * 100 / nobs, 1)
        none_miss <- nobs - missings

        message(" ~ NOTE: ", format(missings, big.mark = ".", decimal.mark = ","),
                " missings generated from grouping variables (", percent,
                " %). Number of observations: ", format(none_miss, big.mark = ".", decimal.mark = ","), "/", format(nobs, big.mark = ".", decimal.mark = ","))
    }
}


#' Compute Percentages by Group
#'
#' @description
#' Calculate the percentages inside a specific group of variables by generating the
#' super group first and joining the super group totals back to the data frame.
#'
#' @param original_df The original unchanged data frame provided by the user.
#' @param summary_df Already summed up result data frame to build up upon.
#' @param statistics User provided statistics.
#' @param group_vars Grouping variables.
#' @param formats Provided list of variables with formats to apply.
#' @param values he values for which to compute percentages.
#' @param weight Weighting variable.
#' @param list_of_statistics A list of all statistics that can be computed.
#' @param monitor_df Data frame which stores the monitoring values.
#' @param fast_pct Flag that states whether the function can take a shortcut for faster
#' calculation.
#'
#' @return
#' Returns a data frame with added percentages, if the statistic was specified.
#'
#' @noRd
compute_group_percentages <- function(original_df,
                                      summary_df,
                                      statistics,
                                      group_vars,
                                      formats,
                                      values,
                                      weight,
                                      list_of_statistics,
                                      monitor_df,
                                      fast_pct){
    if (!"pct_group" %in% statistics){
        return(list(summary_df, monitor_df))
    }

    monitor_df <- monitor_df |> monitor_start("pct_group", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))

    # Create pseudo group variable for totals
    original_df[[".temp_key"]] <- 1
    summary_df[[".temp_key"]]  <- 1

    # Get rid of the last grouping variable to get the grouping above the
    # desired grouping
    if (length(group_vars) > 1){
        super_group <- group_vars[-length(group_vars)]
    }
    else{
        super_group <- ".temp_key"
    }

    cut_off_var <- group_vars[length(group_vars)]

    # If no fast percentage (means original_df is the full data frame)
    if (!fast_pct){
        # It is faster to summarise first without formats
        # and then apply the formats to a much smaller data frame.
        result_list <- original_df |>
            matrix_summarise(values,
                             super_group,
                             original_df[[weight]],
                             NULL,
                             list_of_statistics,
                             monitor_df)

        super_df <- result_list[[1]]
        rm(result_list)

        # Convert numeric variables back which have become characters during summarisation
        # and apply formats
        super_df <- super_df |>
            convert_numeric(super_group) |>
            apply_format(formats, super_group) |>
            dropp("sum_wgt")

        # Catch new variable names
        values <- super_df |> inverse(group_vars)
    }
    # If fast percentage (means original_df is already summarised)
    else{
        values <- values[grepl("_sum$", values)]

        # Summarise first
        super_df <- original_df |>
            collapse::fgroup_by(super_group) |>
            collapse::fsummarise(across(values, collapse::fsum)) |>
            convert_numeric(super_group)

        super_df <- super_df |>
            apply_format(formats, super_group)
    }

    # Generate new variable names for super_group totals
    new_names <- paste0(values, "_qol")
    old_names <- values

    # Final summarise with formatted data frame
    super_df <- super_df |>
        collapse::fgroup_by(super_group) |>
        collapse::fsummarise(across(values, collapse::fsum)) |>
        collapse::frename(stats::setNames(old_names, new_names)) |>
        keep(super_group, new_names)

    # Join super_df to summarized data frame
    if (is.null(super_group)){
        # In case of only totals
        joined_df <- merge(summary_df, super_df,
                           by = ".temp_key",
                           allow.cartesian = TRUE)
    }
    else{
        # In case there is at least one grouping variable
        joined_df <- collapse::join(summary_df, super_df,
                                    on = super_group,
                                    how = "left",
                                    verbose = FALSE)
    }

    # Return finished data frame
    joined_df <- joined_df |>
        calculate_percentages(values, "pct_group", cut_off_var) |>
        dropp(".temp_key")

    monitor_df <- monitor_df |> monitor_end()

    list(joined_df, monitor_df)
}


#' Compute Percentages by Group
#'
#' @description
#' Calculate the percentages inside a specific group of variables, which builds upon
#' a data frame which already carries the summarised super groups.
#'
#' @param summary_df Already summed up result data frame to build up upon.
#' @param super_df Already summed up super group results.
#' @param statistics User provided statistics.
#' @param group_vars Grouping variables.
#' @param values he values for which to compute percentages.
#' @param monitor_df Data frame which stores the monitoring values.
#' @param fast_pct Flag that states whether the function can take a shortcut for faster
#' calculation.
#'
#' @return
#' Returns a data frame with added percentages, if the statistic was specified.
#'
#' @noRd
compute_group_percentages_short <- function(summary_df,
                                            super_df,
                                            statistics,
                                            group_vars,
                                            values,
                                            monitor_df,
                                            fast_pct){
    if (!"pct_group" %in% statistics){
        return(list(summary_df, monitor_df))
    }

    monitor_df <- monitor_df |> monitor_start("pct_group", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))

    # Get rid of the last grouping variable to get the grouping above the
    # desired grouping
    super_group <- group_vars[-length(group_vars)]

    # Generate new variable names for super_group totals
    old_names <- values
    new_names <- paste0(values, "_qol")

    super_df <- super_df |>
        keep(super_group, old_names) |>
        collapse::frename(stats::setNames(old_names, new_names))

    # Join data frames by super group
    joined_df <- collapse::join(summary_df, super_df,
                                on = super_group,
                                how = "left",
                                verbose = FALSE)

    # Return finished data frame
    joined_df <- joined_df |>
        calculate_percentages(values, "pct_group", group_vars[length(group_vars)])

    monitor_df <- monitor_df |> monitor_end()

    list(joined_df, monitor_df)
}


#' Compute Total Percentages
#'
#' @description
#' Calculate the percentages based on the overall totals.
#'
#' @param original_df The original unchanged data frame provided by the user.
#' @param summary_df Already summed up result data frame to build up upon.
#' @param statistics User provided statistics.
#' @param group_vars Grouping variables.
#' @param values he values for which to compute percentages.
#' @param weight Weighting variable.
#' @param list_of_statistics A list of all statistics that can be computed.
#' @param monitor_df Data frame which stores the monitoring values.
#' @param fast_pct Flag that states whether the function can take a shortcut for faster
#' calculation.
#'
#' @return
#' Returns a data frame with added percentages, if the statistic was specified.
#'
#' @noRd
compute_total_percentages <- function(original_df,
                                      summary_df,
                                      statistics,
                                      group_vars,
                                      values,
                                      weight,
                                      list_of_statistics,
                                      monitor_df,
                                      fast_pct){
    if (!"pct_total" %in% statistics){
        return(list(summary_df, monitor_df))
    }

    monitor_df <- monitor_df |> monitor_start("pct_total", paste0("Calc(", paste(group_vars, collapse = " + "), ")"))

    # Generate new variable names for super_group totals
    new_names <- paste0(values, "_qol")

    # Compute the data frame with summaries of the super groups
    super_df <- original_df |>
        collapse::fungroup()

    # If no fast percentage (means original_df is the full data frame)
    if (!fast_pct){
        result_list <- super_df |>
            matrix_summarise(values,
                             NULL,
                             super_df[[weight]],
                             NULL,
                             list_of_statistics,
                             monitor_df)

        # Split results and monitor
        super_df <- result_list[[1]]

        # Catch new variable names
        values <- super_df |>
            dropp("sum_wgt") |>
            inverse(group_vars)
    }
    # If fast percentage (means original_df is already summarised)
    else{
        values <- values[grepl("_sum$", values)]

        # Simple Summarise
        super_df <- original_df |>
            collapse::fsummarise(across(values, collapse::fsum))
    }

    # Generate new variable names for super_group totals
    new_names <- paste0(values, "_qol")
    old_names <- values

    super_df <- super_df |>
        collapse::frename(stats::setNames(old_names, new_names)) |>
        keep(new_names)

    # Join super_df to summarized data frame
    summary_df[[".temp_key"]] <- 1
    super_df[[".temp_key"]]   <- 1

    joined_df <- merge(summary_df, super_df,
                       by = ".temp_key",
                       allow.cartesian = TRUE)

    # Return finished data frame
    joined_df <- joined_df |>
        calculate_percentages(values, "pct_total", ".temp_key") |>
        dropp(".temp_key")

    monitor_df <- monitor_df |> monitor_end()

    list(joined_df, monitor_df)

}


#' Compute Total Percentages
#'
#' @description
#' Calculate the percentages based on the overall totals, which builds upon
#' a data frame which already carries the summarised overall totals.
#'
#' @param summary_df Already summed up result data frame to build up upon.
#' @param total_df Already summed up result data frame to build up upon.
#' @param statistics User provided statistics.
#' @param values he values for which to compute percentages.
#' @param last_group_var The last variable of grouping variables.
#' @param pct_name Name of percentages.
#' @param monitor_df Data frame which stores the monitoring values.
#'
#' @return
#' Returns a data frame with added percentages, if the statistic was specified.
#'
#' @noRd
compute_total_percentages_short <- function(summary_df,
                                            total_df,
                                            statistics,
                                            values,
                                            last_group_var,
                                            pct_name = "pct_total",
                                            monitor_df){
    if (!pct_name %in% statistics){
        return(list(summary_df, monitor_df))
    }

    monitor_df <- monitor_df |> monitor_start(pct_name, paste0("Calc(", paste(last_group_var, collapse = " + "), ")"))

    # Join super_df to summarized data frame
    summary_df[[".temp_key"]] <- 1
    total_df[[".temp_key"]]   <- 1

    joined_df <- merge(summary_df, total_df,
                       by = ".temp_key",
                       allow.cartesian = TRUE)

    # Return finished data frame
    if (pct_name == "pct_total"){
        joined_df <- joined_df |>
            calculate_percentages(values, pct_name, ".temp_key") |>
            dropp(".temp_key")
    }
    else{
        # For group percentages with depth 1
        joined_df <- joined_df |>
            calculate_percentages(values, pct_name, last_group_var) |>
            dropp(".temp_key")
    }

    monitor_df <- monitor_df |> monitor_end()

    list(joined_df, monitor_df)

}


#' Compute Percentages
#'
#' @description
#' The main process of calculating percentages.
#'
#' @param joined_df Data frame with joined super group or overall totals.
#' @param values he values for which to compute percentages.
#' @param last_group_var The last variable of grouping variables.
#' @param pct_name Name of percentages.
#'
#' @return
#' Returns a data frame with added percentages.
#'
#' @noRd
calculate_percentages <- function(joined_df, values, pct_name, last_group_var){
    # Specify numerators and denominators
    numerator <- values

    if (!all(values %in% names(joined_df))){
        # In case variable names have the statistic extension at the end
        numerator <- paste0(values, "_sum")
    }

    denominator <- paste0(values, "_qol")
    new_values <- paste0(gsub("_sum$", "", values), "_", pct_name)

    # Compute percentages for every variable
    for (i in seq_along(numerator)) {
        current_num <- numerator[i]
        current_den <- denominator[i]
        current_new_var <- new_values[i]

        # Evaluate percentage and set division by 0 to NA
        result <- joined_df[[current_num]] * 100 / joined_df[[current_den]]
        result[is.nan(result)] <- NA

        joined_df[[current_new_var]] <- result
    }

    joined_df |>
        dropp(denominator)
}

Try the qol package in your browser

Any scripts or data that you put into this service are public.

qol documentation built on Dec. 14, 2025, 1:06 a.m.