# R/c2p_nest.R
# In mintyr: Streamlined Data Processing Tools for Genomic Selection

# Documented in c2p_nest

# WARNING - Generated by {fusen} from dev/flat_teaching.Rmd: do not edit by hand

#' Column to Pair Nested Transformation
#'
#' @description
#' Generates every combination of `pairs_n` columns from `cols2bind`, renames the
#' combined columns to `value1`, `value2`, ..., stacks the per-combination tables,
#' and nests the result by combination name (plus any `by` columns).
#'
#' @param data Input `data frame` or `data table`
#'   - Must contain valid columns for transformation
#'   - Supports multiple data types
#'
#' @param cols2bind Column specification for pair generation
#'   - Can be a `character` vector of column names
#'   - Can be a `numeric` vector of column indices
#'   - Must reference existing columns in the dataset
#'
#' @param by Optional grouping specification
#'   - Can be a `character` vector of column names
#'   - Can be a `numeric` vector of column indices
#'   - Grouping columns are used together with `pairs` as the nesting keys
#'   - Must not overlap with `cols2bind`
#'   - Default is `NULL`
#'
#' @param pairs_n Single whole `numeric` giving the number of columns in each combination
#'   - Minimum value: 2
#'   - Maximum value: Length of `cols2bind`
#'   - `choose(length(cols2bind), pairs_n)` combinations are generated
#'   - Default is 2
#'
#' @param sep `character` separator for pair naming
#'   - Used to join the column names of each combination into its `pairs` identifier
#'   - Must be a single character string (a length-one `character` vector)
#'   - Default is "-"
#'
#' @param nest_type Output nesting format
#'   - `"dt"`: Returns nested `data table` (default)
#'   - `"df"`: Returns nested `data frame`
#'
#' @details
#' Advanced Transformation Mechanism:
#' \enumerate{
#'   \item Input validation and preprocessing
#'   \item Dynamic column combination generation
#'   \item Flexible pair transformation
#'   \item Nested data structure creation
#' }
#'
#' Transformation Process:
#' \itemize{
#'   \item Validate input parameters and column specifications
#'   \item Convert numeric indices to column names if necessary
#'   \item Generate column combinations
#'   \item Create subset data tables
#'   \item Merge and nest transformed data
#' }
#'
#' Column Specification:
#' \itemize{
#'   \item Supports both column names and numeric indices
#'   \item Numeric indices must be within valid range (1 to ncol)
#'   \item Column names must exist in the dataset
#'   \item Flexible specification for both cols2bind and by parameters
#' }
#'
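#' @return `data table` containing nested transformation results
#'   - Includes `pairs` column identifying column combinations
#'   - Contains `data` list column; each element is a `data table`
#'     (`nest_type = "dt"`) or a `data frame` (`nest_type = "df"`) holding the
#'     combined columns as `value1`, `value2`, ... plus the remaining columns
#'   - Includes the `by` columns as additional keys when `by` is supplied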
#'
#' @note Key Operation Constraints:
#' \itemize{
#'   \item Requires non-empty input data
#'   \item Column specifications must be valid (either names or indices)
#'   \item Supports flexible combination strategies
#'   \item Computational complexity increases with combination size
#' }
#'
#' @seealso
#' \itemize{
#'   \item [`utils::combn()`] Combination generation
#' }
#'
#' @import data.table
#' @importFrom utils combn
#' @export
#' @examples
#' # Example data preparation: Define column names for combination
#' col_names <- c("Sepal.Length", "Sepal.Width", "Petal.Length")
#'
#' # Example 1: Basic column-to-pairs nesting with custom separator
#' c2p_nest(
#'   iris,                   # Input iris dataset
#'   cols2bind = col_names,  # Columns to be combined as pairs
#'   pairs_n = 2,            # Create pairs of 2 columns
#'   sep = "&"               # Custom separator for pair names
#' )
#' # Returns a nested data.table where:
#' # - pairs: combined column names (e.g., "Sepal.Length&Sepal.Width")
#' # - data: list column containing data.tables with value1, value2 columns
#'
#' # Example 2: Column-to-pairs nesting with numeric indices and grouping
#' c2p_nest(
#'   iris,                   # Input iris dataset
#'   cols2bind = 1:3,        # First 3 columns to be combined
#'   pairs_n = 2,            # Create pairs of 2 columns
#'   by = 5                  # Group by 5th column (Species)
#' )
#' # Returns a nested data.table where:
#' # - pairs: combined column names
#' # - Species: grouping variable
#' # - data: list column containing data.tables grouped by Species

c2p_nest <- function(data, cols2bind, by = NULL, pairs_n = 2, sep = "-",
                     nest_type = "dt") {
  . <- pairs <- NULL  # Silence R CMD check notes for data.table's NSE symbols

  # ---- Validation ---------------------------------------------------------
  # Every check runs on the raw input with base R only, so an invalid call
  # fails with a clear message before any data is converted or copied.
  if (!inherits(data, c("data.table", "data.frame"))) {
    stop("data must be a data.table or a data.frame")
  }
  col_names <- names(data)

  # Resolve cols2bind: numeric indices are translated to column names.
  # NA indices are rejected explicitly; otherwise `if (any(NA))` would fail
  # with an unrelated "missing value where TRUE/FALSE needed" error.
  if (is.numeric(cols2bind)) {
    if (anyNA(cols2bind) || any(cols2bind > ncol(data) | cols2bind < 1)) {
      stop("Invalid column indices in cols2bind")
    }
    cols2bind <- col_names[cols2bind]
  }

  if (!is.character(cols2bind)) {
    stop("cols2bind must be either a character vector or numeric vector")
  }
  missing_cols <- cols2bind[!cols2bind %in% col_names]
  if (length(missing_cols) > 0) {
    stop("Some columns specified in cols2bind are not present in the data: ",
         paste(missing_cols, collapse = ", "))
  }
  # A duplicated column would end up paired with itself and make the later
  # rename ambiguous, so reject it up front.
  if (anyDuplicated(cols2bind) > 0) {
    stop("cols2bind must not contain duplicated columns: ",
         paste(unique(cols2bind[duplicated(cols2bind)]), collapse = ", "))
  }

  # Resolve by: same rules as cols2bind (indices -> names, must exist).
  if (!is.null(by)) {
    if (is.numeric(by)) {
      if (anyNA(by) || any(by > ncol(data) | by < 1)) {
        stop("Invalid column indices in by")
      }
      by <- col_names[by]
    }
    if (!is.character(by)) {
      stop("'by' must be either a character vector or numeric vector of column indices")
    }
    missing_by_vars <- by[!by %in% col_names]
    if (length(missing_by_vars) > 0) {
      stop("Grouping variables not present in data: ",
           paste(missing_by_vars, collapse = ", "))
    }
    # Columns in cols2bind are renamed to value1, value2, ... below, so they
    # no longer exist under their own name when the grouping is applied.
    by_overlap <- intersect(by, cols2bind)
    if (length(by_overlap) > 0) {
      stop("'by' columns must not also appear in cols2bind: ",
           paste(by_overlap, collapse = ", "))
    }
  }

  # pairs_n must be one finite whole number >= 2. The length and finiteness
  # checks come first so that `||` only ever sees length-one, non-NA values.
  if (!is.numeric(pairs_n) || length(pairs_n) != 1L || !is.finite(pairs_n) ||
      pairs_n < 2 || floor(pairs_n) != pairs_n) {
    stop("pairs_n must be a positive integer greater than or equal to 2")
  }

  # A combination cannot use more columns than were supplied.
  if (pairs_n > length(cols2bind)) {
    stop(sprintf("pairs_n (%d) cannot be larger than the number of available columns (%d)",
                 pairs_n, length(cols2bind)))
  }

  if (!is.character(sep) || length(sep) != 1L || is.na(sep)) {
    stop("sep must be a single character string")
  }

  if (!is.character(nest_type) || length(nest_type) != 1L ||
      !nest_type %in% c("dt", "df")) {
    stop("Invalid nest_type provided. It must be either 'dt' or 'df'.")
  }

  # Columns outside cols2bind are carried unchanged into every combination.
  # They must not clash with the names this function generates; otherwise
  # `pairs` would be silently overwritten or `valueN` would be duplicated.
  fixed_cols <- setdiff(col_names, cols2bind)
  value_cols <- paste0("value", seq_len(pairs_n))
  name_clashes <- intersect(fixed_cols, c("pairs", value_cols))
  if (length(name_clashes) > 0) {
    stop("Column names reserved for the output already exist in data: ",
         paste(name_clashes, collapse = ", "),
         ". Please rename them before calling c2p_nest().")
  }

  # ---- Build one long table holding every column combination --------------
  # No explicit copy() is needed: selecting columns with `with = FALSE`
  # returns a new table, and only those new tables are modified by reference.
  dt <- data.table::as.data.table(data)
  comb_cols_list <- utils::combn(cols2bind, pairs_n, simplify = FALSE)

  list_of_dts <- lapply(comb_cols_list, function(comb) {
    dt_subset <- dt[, c(fixed_cols, comb), with = FALSE]
    pairs_name <- paste(comb, collapse = sep)
    # Give the combined columns uniform names so all subsets stack cleanly.
    data.table::setnames(dt_subset, comb, value_cols)
    dt_subset[, pairs := pairs_name]
    dt_subset
  })

  dt_bind <- data.table::rbindlist(list_of_dts)

  # ---- Nest ----------------------------------------------------------------
  # c("pairs", NULL) is just "pairs", so this covers the ungrouped case too.
  groupby <- c("pairs", by)

  if (nest_type == "dt") {
    dt_bind[, .(data = list(.SD)), by = groupby]
  } else {
    dt_bind[, .(data = list(as.data.frame(.SD))), by = groupby]
  }
}