R/r2p_nest.R
In mintyr: Streamlined Data Processing Tools for Genomic Selection

Documented in r2p_nest

# WARNING - Generated by {fusen} from dev/flat_teaching.Rmd: do not edit by hand

#' Row to Pair Nested Transformation
#'
#' @description
#' A sophisticated data transformation tool for performing row pair conversion 
#' and creating nested data structures with advanced configuration options.
#'
#' @param data Input `data frame` or `data table`
#'   - Must contain valid columns for transformation
#'   - Supports multiple data types
#'
#' @param rows2bind Row binding specification
#'   - Can be a `character` column name
#'   - Can be a `numeric` column index
#'   - Must be a single column identifier
#'
#' @param by Grouping specification for nested pairing
#'   - Can be a `character` vector of column names
#'   - Can be a `numeric` vector of column indices
#'   - Must specify at least one column
#'   - Supports multi-column transformation
#'
#' @param nest_type Output nesting format
#'   - `"dt"`: Returns nested `data table` (default)
#'   - `"df"`: Returns nested `data frame`
#'
#' @details
#' Advanced Transformation Mechanism:
#' \enumerate{
#'   \item Input validation and preprocessing
#'   \item Dynamic column identification
#'   \item Flexible row pairing across specified columns
#'   \item Nested data structure generation
#' }
#'
#' Transformation Process:
#' \itemize{
#'   \item Validate input parameters and column specifications
#'   \item Convert numeric indices to column names if necessary
#'   \item Reshape data from wide to long format
#'   \item Perform column-wise nested transformation
#'   \item Generate final nested structure
#' }
#'
#' Column Specification:
#' \itemize{
#'   \item Supports both column names and numeric indices
#'   \item Numeric indices must be within valid range (1 to ncol)
#'   \item Column names must exist in the dataset
#'   \item Flexible specification for both rows2bind and by parameters
#' }
#'
#' @return `data table` containing nested transformation results
#'   - Includes `name` column identifying source columns
#'   - Contains `data` column storing nested data structures
#'
#' @note Key Operation Constraints:
#' \itemize{
#'   \item Requires non-empty input data
#'   \item Column specifications must be valid (either names or indices)
#'   \item By parameter must specify at least one column
#'   \item Low computational overhead
#' }
#'
#' @seealso
#' \itemize{
#'   \item [`data.table::melt()`] Long format conversion
#'   \item [`data.table::dcast()`] Wide format conversion
#'   \item [`base::rbind()`] Row binding utility
#'   \item [`c2p_nest()`] Column to pair nested transformation
#' }
#'
#' @import data.table
#' @importFrom stats as.formula
#' @export
#' @examples
#' # Example 1: Row-to-pairs nesting with column names
#' r2p_nest(
#'   mtcars,                     # Input mtcars dataset
#'   rows2bind = "cyl",          # Column to be used as row values
#'   by = c("hp", "drat", "wt")  # Columns to be transformed into pairs
#' )
#' # Returns a nested data.table where:
#' # - name: variable names (hp, drat, wt)
#' # - data: list column containing data.tables with rows grouped by cyl values
#'
#' # Example 2: Row-to-pairs nesting with numeric indices
#' r2p_nest(
#'   mtcars,                     # Input mtcars dataset
#'   rows2bind = 2,              # Use 2nd column (cyl) as row values
#'   by = 4:6                    # Use columns 4-6 (hp, drat, wt) for pairs
#' )
#' # Returns a nested data.table where:
#' # - name: variable names from columns 4-6
#' # - data: list column containing data.tables with rows grouped by cyl values

r2p_nest <- function(data, rows2bind, by, nest_type = "dt") {
  # Input validation
  if (length(by) < 1) {
    stop("At least one column must be specified in 'by'")
  }
  if (length(rows2bind) != 1) {
    stop("rows2bind must be a single column")
  }

  # Convert data to data.table first
  data <- as.data.table(data)

  # Validate column existence and indices
  if (is.numeric(by)) {
    if (any(by > ncol(data) | by < 1)) {
      stop("Invalid column indices in by")
    }
  } else if (!all(by %in% names(data))) {
    stop("Specified by columns not found in data")
  }

  if (is.numeric(rows2bind)) {
    if (rows2bind > ncol(data) | rows2bind < 1) {
      stop("Invalid column index in rows2bind")
    }
  } else if (!rows2bind %in% names(data)) {
    stop("Specified rows2bind not found in data")
  }

  # Process each column in 'by'
  result_list <- lapply(by, function(x) {
    row_pair_init(
      data = data,
      rows2bind = rows2bind,
      by = x,
      nest_type = nest_type
    )
  })

  # Combine all results using rbindlist
  combined_result <- rbindlist(result_list)

  return(combined_result)
}
row_pair_init <- function (data, rows2bind, by, nest_type = "dt") {
  . <- NULL
  # Convert numeric indices to column names
  if (is.numeric(by)) {
    by <- names(data)[by]
  }
  if (is.numeric(rows2bind)) {
    rows2bind <- names(data)[rows2bind]
  }

  # Get ID columns (all columns except 'by' columns)
  id_cols <- setdiff(names(data), by)

  # Reshape data from wide to long format
  long_dt <- melt(data,
                  id.vars = id_cols,
                  measure.vars = by,
                  variable.name = "name",
                  value.name = "value")

  # Get columns for formula creation
  other_ids <- setdiff(id_cols, rows2bind)

  # Create formula string for dcast
  formula_str <- paste(paste(c("name", other_ids), collapse = " + "),
                       rows2bind, sep = " ~ ")

  # Reshape data from long to wide format
  wide_dt <- dcast(long_dt, as.formula(formula_str), value.var = "value")

  # Create nested output based on nest_type
  if (nest_type == "dt") {
    result <- wide_dt[, .(data = list(.SD)), by = "name"]
  }
  else if (nest_type == "df") {
    result <- wide_dt[, .(data = list(as.data.frame(.SD))),
                      by = "name"]
  }
  else {
    stop("Invalid nest_type provided. It must be either 'dt' or 'df'.")
  }

  return(result)
}