R/w2l_split.R
In mintyr: Streamlined Data Processing Tools for Genomic Selection

Documented in w2l_split

# WARNING - Generated by {fusen} from dev/flat_teaching.Rmd: do not edit by hand

#' Reshape Wide Data to Long Format and Split into List
#'
#' @description
#' The `w2l_split` function reshapes wide-format data into long-format and splits it into a list
#' by variable names and optional grouping columns. It handles both `data.frame` and `data.table` objects.
#'
#' @param data `data.frame` or `data.table`
#'   - Input dataset in wide format
#'   - Automatically converted to `data.table` if necessary
#'
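#' @param cols2l `numeric` or `character` columns to transform
#'   - Specifies columns for wide-to-long conversion
#'   - Can be column indices or column names
#'   - Default is `NULL`, in which case no reshaping is performed, the data
#'     is split by `by` only, and `by` must therefore be supplied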
#'
#' @param by `numeric` or `character` grouping variables
#'   - Optional columns for data splitting
#'   - Can be column indices or column names
#'   - Used to create hierarchical split structure
#'   - Default is `NULL`
#'
#' @param split_type `character` output data type
#'   - Defines split data object type
#'   - Possible values:
#'     - `"dt"`: split `data.table` objects
#'     - `"df"`: split `data.frame` objects
#'   - Default is `"dt"`
#'
#' @param sep `character` separator
#'   - Used for combining split names
#'   - Default is `"_"`
#'
#' @details
#' The function melts the specified wide columns into long format and splits the resulting data
#' into a list based on the variable names and any additional grouping variables specified in `by`.
#' The split data can be in the form of `data.table` or `data.frame` objects, controlled by the
#' `split_type` parameter.
#'
#' Both `cols2l` and `by` parameters accept either column indices or column names, providing flexible ways
#' to specify the columns for transformation and splitting.
#'
#' @return A list of `data.table` or `data.frame` objects (depending on `split_type`), split by variable
#' names and optional grouping columns.
#' \itemize{
#'   \item If `by` is `NULL`, returns a list split by variable names only.
#'   \item If `by` is specified, returns a list split by both variable names and grouping variables.
#' }
#'
#' @note
#' \itemize{
#'   \item Both `cols2l` and `by` parameters can be specified using either numeric indices or character column names.
#'   \item When using numeric indices, they must be valid column positions in the data (1 to ncol(data)).
#'   \item When using character names, all specified columns must exist in the data.
#'   \item The function converts `data.frame` to `data.table` if necessary.
#'   \item The `split_type` parameter controls whether split data are `data.table` (`"dt"`) or `data.frame` (`"df"`) objects.
#'   \item If `split_type` is not `"dt"` or `"df"`, the function will stop with an error.
#' }
#'
#' @seealso
#' Related functions and packages:
#' \itemize{
#'   \item [`tidytable::group_split()`] Split data frame by groups
#' }
#'
#' @import data.table
#' @export
#' @examples
#' # Example: Wide to long format splitting demonstrations
#'
#' # Example 1: Basic splitting by Species
#' w2l_split(
#'   data = iris,                    # Input dataset
#'   by = "Species"                  # Split by Species column
#' ) |> 
#'   lapply(head)                    # Show first 6 rows of each split
#'
#' # Example 2: Split specific columns using numeric indices
#' w2l_split(
#'   data = iris,                    # Input dataset
#'   cols2l = 1:3,                   # Select first 3 columns to split
#'   by = 5                          # Split by column index 5 (Species)
#' ) |> 
#'   lapply(head)                    # Show first 6 rows of each split
#'
#' # Example 3: Split specific columns using column names
#' list_res <- w2l_split(
#'   data = iris,                    # Input dataset
#'   cols2l = c("Sepal.Length",      # Select columns by name
#'              "Sepal.Width"),
#'   by = "Species"                  # Split by Species column
#' )
#' lapply(list_res, head)            # Show first 6 rows of each split
#' # Returns similar structure to Example 2

w2l_split <- function(data, cols2l = NULL, by = NULL, split_type = "dt", sep = "_") {
  # Validate the output type first so an invalid request fails before any
  # reshaping or splitting work is done.
  if (!is.character(split_type) || length(split_type) != 1L ||
      !split_type %in% c("dt", "df")) {
    stop("Invalid split_type provided. It must be either 'dt' or 'df'.")
  }

  # Work on a data.table; plain data.frames are converted (copied), anything
  # else is rejected.
  if (!data.table::is.data.table(data)) {
    if (is.data.frame(data)) {
      data <- data.table::as.data.table(data)
    } else {
      stop("data must be a data.frame or data.table.")
    }
  }

  # Build one list name per group by pasting the values of the unique key
  # rows. Working from the unique key rows (instead of de-duplicating pasted
  # strings of every row) guarantees exactly one name per split element even
  # when two different key combinations paste to the same string, and avoids
  # pasting the full data. Both unique() and split(by = ) keep groups in
  # order of first appearance, so names and elements stay aligned.
  group_names <- function(dt, key_cols) {
    key_rows <- unique(dt, by = key_cols)
    do.call(
      paste,
      c(lapply(key_cols, function(col) key_rows[[col]]), list(sep = sep))
    )
  }

  # Resolve `by` to column names - accepts numeric indices or names
  if (!is.null(by)) {
    if (is.numeric(by)) {
      # anyNA() first: comparing NA indices would make the condition NA
      if (anyNA(by) || any(by < 1 | by > ncol(data))) {
        stop("Numeric indices in by are out of bounds.")
      }
      by <- names(data)[by]
    } else if (is.character(by)) {
      if (!all(by %in% names(data))) {
        missing_by <- by[!by %in% names(data)]
        stop("Some 'by' columns are not present in the data: ",
             paste(missing_by, collapse = ", "))
      }
    } else {
      stop("by should be either numeric indices or character vector of column names.")
    }
  }

  if (is.null(cols2l)) {
    # No reshaping requested: the grouping columns are the only thing to
    # split on, so they are mandatory here.
    if (is.null(by)) {
      stop("When cols2l is NULL, by parameter must be provided.")
    }
    # The grouping columns are dropped from each piece (keep.by = FALSE)
    dt_list <- split(data, by = by, keep.by = FALSE, drop = TRUE)
    names(dt_list) <- group_names(data, by)
  } else {
    # Resolve `cols2l` to column names - accepts numeric indices or names
    if (is.numeric(cols2l)) {
      if (anyNA(cols2l) || any(cols2l < 1 | cols2l > ncol(data))) {
        stop("Numeric indices in cols2l are out of bounds.")
      }
      cols2l_names <- names(data)[cols2l]
    } else if (is.character(cols2l)) {
      if (!all(cols2l %in% names(data))) {
        missing_cols <- cols2l[!cols2l %in% names(data)]
        stop("Some columns specified in cols2l are not present in the data: ",
             paste(missing_cols, collapse = ", "))
      }
      cols2l_names <- cols2l
    } else {
      stop("cols2l should be either numeric indices or character vector of column names.")
    }

    # ID variables: every column that is not melted, plus the grouping columns
    id_vars <- setdiff(names(data), cols2l_names)
    if (!is.null(by)) {
      id_vars <- unique(c(id_vars, by))
    }

    # Melt data from wide to long format
    dt_long <- data.table::melt(data, id.vars = id_vars, measure.vars = cols2l_names,
                                variable.name = "variable", value.name = "value")

    # Split by the melted variable name first, then by the grouping columns
    split_vars <- c("variable", by)
    dt_list <- split(dt_long, by = split_vars, keep.by = FALSE, drop = TRUE)

    # Without `by`, split() already names the elements after the variables;
    # with `by`, rebuild the names using the caller's separator.
    if (!is.null(by)) {
      names(dt_list) <- group_names(dt_long, split_vars)
    }
  }

  # Convert each piece to a plain data.frame when requested
  if (split_type == "df") {
    dt_list <- lapply(dt_list, as.data.frame)
  }

  dt_list
}