Nothing
# WARNING - Generated by {fusen} from dev/flat_teaching.Rmd: do not edit by hand
#' Select Top Percentage of Data and Statistical Summarization
#'
#' @description
#' The `top_perc` function selects the top percentage of data based on a specified trait and computes summary statistics.
#' It allows for grouping by additional columns and offers flexibility in the type of statistics calculated.
#' The function can also retain the selected data if needed.
#'
#' @param data A `data.frame` containing the source dataset for analysis
#' - Supports various data frame-like structures
#' - Automatically converts non-data frame inputs
#'
#' @param perc Numeric vector of percentages for data selection
#' - Range: `-1` to `1`
#' - Positive values: Select top percentiles
#' - Negative values: Select bottom percentiles
#' - Multiple percentiles supported
#'
#' @param trait Character string specifying the 'selection column'
#' - Must be a valid column name in the input `data`
#' - Used as the basis for top/bottom percentage selection
#'
#' @param by Optional character vector for 'grouping columns'
#' - Default is `NULL`
#' - Enables stratified analysis
#' - Allows granular percentage selection within groups
#'
#' @param type Statistical summary type
#' - Default: `"mean_sd"`
#' - Controls the type of summary statistics computed
#' - Supports various summary methods from `rstatix`
#'
#' @param keep_data Logical flag for data retention
#' - Default: `FALSE`
#' - `TRUE`: Return both summary statistics and selected data
#' - `FALSE`: Return only summary statistics
#'
#' @return A list or data frame:
#' \itemize{
#' \item If `keep_data` is FALSE, a data frame with summary statistics.
#' \item If `keep_data` is TRUE, a list where each element is a list containing summary statistics (`stat`) and the selected top data (`data`).
#' }
#'
#' @note
#' \itemize{
#' \item The `perc` parameter accepts values between -1 and 1. Positive values select the top percentage, while negative values select the bottom percentage.
#' \item The function performs initial checks to ensure required arguments are provided and valid.
#' \item Grouping by additional columns (`by`) is optional and allows for more granular analysis.
#' \item The `type` parameter specifies the type of summary statistics to compute, with "mean_sd" as the default.
#' \item If `keep_data` is set to TRUE, the function will return both the summary statistics and the selected top data for each percentage.
#' }
#'
#' @seealso
#' \itemize{
#' \item [`rstatix::get_summary_stats()`] Statistical summary computation
#' \item [`dplyr::top_frac()`] Percentage-based data selection
#' }
#'
#' @importFrom purrr map set_names list_rbind
#' @importFrom dplyr group_by top_frac across all_of mutate
#' @importFrom rstatix get_summary_stats
#' @importFrom rlang sym
#' @export
#' @examples
#' # Example 1: Basic usage with single trait
#' # This example selects the top 10% of observations based on Petal.Width
#' # keep_data=TRUE returns both summary statistics and the filtered data
#' top_perc(iris,
#' perc = 0.1, # Select top 10%
#' trait = c("Petal.Width"), # Column to analyze
#' keep_data = TRUE) # Return both stats and filtered data
#'
#' # Example 2: Using grouping with 'by' parameter
#' # This example performs the same analysis but separately for each Species
#' # Returns nested list with stats and filtered data for each group
#' top_perc(iris,
#' perc = 0.1, # Select top 10%
#' trait = c("Petal.Width"), # Column to analyze
#' by = "Species") # Group by Species
#'
#' # Example 3: Complex example with multiple percentages and grouping variables
#' # Reshape data from wide to long format for Sepal.Length and Sepal.Width
#' iris |>
#' tidyr::pivot_longer(1:2,
#' names_to = "names",
#' values_to = "values") |>
#' mintyr::top_perc(
#' perc = c(0.1, -0.2),
#' trait = "values",
#' by = c("Species", "names"),
#' type = "mean_sd")
top_perc <- function(data, perc, trait, by = NULL, type = "mean_sd", keep_data = FALSE) {
# Initial checks and data preparation
missing_args <- c()
if (missing(data)) missing_args <- c(missing_args, "data")
if (missing(perc)) missing_args <- c(missing_args, "perc")
if (missing(trait)) missing_args <- c(missing_args, "trait")
if (length(missing_args) > 0) {
stop("Error: Missing argument(s): ", paste(missing_args, collapse=", "))
}
if (!inherits(data, "data.frame")) {
message("Converting 'data' to data.frame")
data <- as.data.frame(data)
}
# Ensure 'perc' is treated as a numeric vector
perc <- as.numeric(perc)
if (length(perc) == 0) {
stop("Error: 'perc' must not be empty.")
}
if (any(perc < -1 | perc > 1)) {
stop("Error: Each element of 'perc' must be a numeric value between -1 and 1.")
}
# Validate 'trait' parameter
if (!is.character(trait) || length(trait) != 1) {
stop("Error: 'trait' must be a single character string.")
}
if (!trait %in% names(data)) {
stop("Error: 'trait' must be a valid column name in 'data'.")
}
# Validate 'by' parameter if not NULL
if (!is.null(by)) {
if (!is.character(by) || length(by) == 0) {
stop("Error: 'by' must be a character vector of column names in 'data'.")
}
if (!all(by %in% names(data))) {
stop("Error: All elements of 'by' must be valid column names in 'data'.")
}
}
# Processing each percentage
results <- purrr::map(perc, function(p) {
grouped_data <- if (!is.null(by) && length(by) > 0) {
data |> dplyr::group_by(dplyr::across(dplyr::all_of(by)))
} else {
data
}
top_data <- grouped_data |>
dplyr::top_frac(p, !!rlang::sym(trait))
# Always compute stats
stats <- top_data |>
rstatix::get_summary_stats(!!rlang::sym(trait), type = type) |>
dplyr::mutate(top_perc = paste0(p * 100, "%"))
# Return both stats and data if keep_data is TRUE
if (keep_data) {
list(stat = stats, data = top_data)
} else {
list(stat = stats)
}
}) |>
purrr::set_names(paste(trait, perc, sep = "_"))
# Simplify the output structure based on what is available in each result
if (keep_data) {
results
} else {
results <- purrr::map(results, "stat") |> purrr::list_rbind()
}
return(results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.