R/top_perc.R
In mintyr: Streamlined Data Processing Tools for Genomic Selection

Documented in top_perc

# WARNING - Generated by {fusen} from dev/flat_teaching.Rmd: do not edit by hand

#' Select Top Percentage of Data and Statistical Summarization
#'
#' @description
#' The `top_perc` function selects the top percentage of data based on a specified trait and computes summary statistics.
#' It allows for grouping by additional columns and offers flexibility in the type of statistics calculated.
#' The function can also retain the selected data if needed.
#'
#' @param data A `data.frame` containing the source dataset for analysis
#'   - Supports various data frame-like structures
#'   - Automatically converts non-data frame inputs
#'
#' @param perc Numeric vector of percentages for data selection
#'   - Range: `-1` to `1`
#'   - Positive values: Select top percentiles
#'   - Negative values: Select bottom percentiles
#'   - Multiple percentiles supported
#'
#' @param trait Character string specifying the 'selection column'
#'   - Must be a valid column name in the input `data`
#'   - Used as the basis for top/bottom percentage selection
#'
#' @param by Optional character vector for 'grouping columns'
#'   - Default is `NULL`
#'   - Enables stratified analysis
#'   - Allows granular percentage selection within groups
#'
#' @param type Statistical summary type
#'   - Default: `"mean_sd"`
#'   - Controls the type of summary statistics computed
#'   - Supports various summary methods from `rstatix`
#'
#' @param keep_data Logical flag for data retention
#'   - Default: `FALSE`
#'   - `TRUE`: Return both summary statistics and selected data
#'   - `FALSE`: Return only summary statistics
#'
#' @return A list or data frame:
#' \itemize{
#'   \item If `keep_data` is FALSE, a data frame with summary statistics.
#'   \item If `keep_data` is TRUE, a list where each element is a list containing summary statistics (`stat`) and the selected top data (`data`).
#' }
#'
#' @note
#' \itemize{
#'   \item The `perc` parameter accepts values between -1 and 1. Positive values select the top percentage, while negative values select the bottom percentage.
#'   \item The function performs initial checks to ensure required arguments are provided and valid.
#'   \item Grouping by additional columns (`by`) is optional and allows for more granular analysis.
#'   \item The `type` parameter specifies the type of summary statistics to compute, with "mean_sd" as the default.
#'   \item If `keep_data` is set to TRUE, the function will return both the summary statistics and the selected top data for each percentage.
#' }
#'
#' @seealso
#' \itemize{
#'   \item [`rstatix::get_summary_stats()`] Statistical summary computation
#'   \item [`dplyr::top_frac()`] Percentage-based data selection
#' }
#'
#' @importFrom purrr map set_names list_rbind
#' @importFrom dplyr group_by top_frac across all_of mutate
#' @importFrom rstatix get_summary_stats
#' @importFrom rlang sym
#' @export
#' @examples
#' # Example 1: Basic usage with single trait
#' # This example selects the top 10% of observations based on Petal.Width
#' # keep_data=TRUE returns both summary statistics and the filtered data
#' top_perc(iris, 
#'          perc = 0.1,                # Select top 10%
#'          trait = c("Petal.Width"),  # Column to analyze
#'          keep_data = TRUE)          # Return both stats and filtered data
#'
#' # Example 2: Using grouping with 'by' parameter
#' # This example performs the same analysis but separately for each Species
#' # Returns nested list with stats and filtered data for each group
#' top_perc(iris, 
#'          perc = 0.1,                # Select top 10%
#'          trait = c("Petal.Width"),  # Column to analyze
#'          by = "Species")            # Group by Species
#'
#' # Example 3: Complex example with multiple percentages and grouping variables
#' # Reshape data from wide to long format for Sepal.Length and Sepal.Width
#' iris |> 
#'   tidyr::pivot_longer(1:2,
#'                       names_to = "names", 
#'                       values_to = "values") |> 
#'   mintyr::top_perc(
#'     perc = c(0.1, -0.2),
#'     trait = "values",
#'     by = c("Species", "names"),
#'     type = "mean_sd")
top_perc <- function(data, perc, trait, by = NULL, type = "mean_sd", keep_data = FALSE) {
  # Initial checks and data preparation
  missing_args <- c()
  if (missing(data)) missing_args <- c(missing_args, "data")
  if (missing(perc)) missing_args <- c(missing_args, "perc")
  if (missing(trait)) missing_args <- c(missing_args, "trait")
  
  if (length(missing_args) > 0) {
    stop("Error: Missing argument(s): ", paste(missing_args, collapse=", "))
  }
  
  if (!inherits(data, "data.frame")) {
    message("Converting 'data' to data.frame")
    data <- as.data.frame(data)
  }
  
  # Ensure 'perc' is treated as a numeric vector
  perc <- as.numeric(perc)
  if (length(perc) == 0) {
    stop("Error: 'perc' must not be empty.")
  }
  if (any(perc < -1 | perc > 1)) {
    stop("Error: Each element of 'perc' must be a numeric value between -1 and 1.")
  }
  
  # Validate 'trait' parameter
  if (!is.character(trait) || length(trait) != 1) {
    stop("Error: 'trait' must be a single character string.")
  }
  if (!trait %in% names(data)) {
    stop("Error: 'trait' must be a valid column name in 'data'.")
  }
  
  # Validate 'by' parameter if not NULL
  if (!is.null(by)) {
    if (!is.character(by) || length(by) == 0) {
      stop("Error: 'by' must be a character vector of column names in 'data'.")
    }
    if (!all(by %in% names(data))) {
      stop("Error: All elements of 'by' must be valid column names in 'data'.")
    }
  }
  
  # Processing each percentage
  results <- purrr::map(perc, function(p) {
    grouped_data <- if (!is.null(by) && length(by) > 0) {
      data |> dplyr::group_by(dplyr::across(dplyr::all_of(by)))
    } else {
      data
    }
    
    top_data <- grouped_data |>
      dplyr::top_frac(p, !!rlang::sym(trait))
    
    # Always compute stats
    stats <- top_data |>
      rstatix::get_summary_stats(!!rlang::sym(trait), type = type) |>
      dplyr::mutate(top_perc = paste0(p * 100, "%"))
    
    # Return both stats and data if keep_data is TRUE
    if (keep_data) {
      list(stat = stats, data = top_data)
    } else {
      list(stat = stats)
    }
  }) |>
    purrr::set_names(paste(trait, perc, sep = "_"))
  
  # Simplify the output structure based on what is available in each result
  if (keep_data) {
    results
  } else {
    results <- purrr::map(results, "stat") |> purrr::list_rbind()
  }
  
  return(results)
}