# R/w2l_nest.R
# In mintyr: Streamlined Data Processing Tools for Genomic Selection
#
# Documented in w2l_nest

# WARNING - Generated by {fusen} from dev/flat_teaching.Rmd: do not edit by hand

#' Reshape Wide Data to Long Format and Nest by Specified Columns
#'
#' @description
#' The `w2l_nest` function reshapes wide-format data into long-format and nests it by specified columns.
#' It handles both `data.frame` and `data.table` objects and provides options for grouping and nesting the data.
#'
#' @param data `data.frame` or `data.table`
#'   - Input dataset in wide format
#'   - Automatically converted to `data.table` if necessary
#'
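#' @param cols2l `numeric` or `character` columns to transform
#'   - Specifies columns for wide-to-long conversion
#'   - Can be column indices or column names
#'   - Default is `NULL`, in which case no reshaping is done and `data` is
#'     nested directly by `by` (which must then be supplied)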
#'
#' @param by `numeric` or `character` grouping variables
#'   - Optional columns for additional data stratification
#'   - Can be column indices or column names
#'   - Used to create hierarchical nested structures
#'   - Default is `NULL`
#'
#' @param nest_type `character` output data type
#'   - Defines nested data object type
#'   - Possible values:
#'     - `"dt"`: nested `data.table`
#'     - `"df"`: nested `data.frame`
#'   - Default is `"dt"`
#'
#' @details
#' The function melts the specified wide columns into long format and nests the resulting data by the `name`
#' column and any additional grouping variables specified in `by`. The nested data can be in the form of
#' `data.table` or `data.frame` objects, controlled by the `nest_type` parameter.
#'
#' Both `cols2l` and `by` parameters accept either column indices or column names, providing flexible ways
#' to specify the columns for transformation and grouping.
#'
#' @return A `data.table` with one row per group and a list-column `data` holding that group's rows as a nested `data.table` or `data.frame`, depending on `nest_type`.
#' \itemize{
#'   \item If `cols2l` is `NULL`, the data is not reshaped and is nested by the `by` columns only.
#'   \item If `cols2l` is given and `by` is `NULL`, returns a `data.table` nested by `name`.
#'   \item If both `cols2l` and `by` are given, returns a `data.table` nested by `name` and the grouping variables.
#' }
#'
#' @note
#' \itemize{
#'   \item Both `cols2l` and `by` parameters can be specified using either numeric indices or character column names.
#'   \item When using numeric indices, they must be valid column positions in the data (1 to ncol(data)).
#'   \item When using character names, all specified columns must exist in the data.
#'   \item The function converts `data.frame` to `data.table` if necessary.
#'   \item The `nest_type` parameter controls whether nested data are `data.table` (`"dt"`) or `data.frame` (`"df"`) objects.
#'   \item If `nest_type` is not `"dt"` or `"df"`, the function will stop with an error.
#' }
#'
#' @seealso
#' Related functions and packages:
#' \itemize{
#'   \item [`tidytable::nest_by()`] Nest data.tables by group
#' }
#'
#' @import data.table
#' @export
#' @examples
#' # Example: Wide to long format nesting demonstrations
#'
#' # Example 1: Basic nesting by group
#' w2l_nest(
#'   data = iris,                    # Input dataset
#'   by = "Species"                  # Group by Species column
#' )
#'
#' # Example 2: Nest specific columns with numeric indices
#' w2l_nest(
#'   data = iris,                    # Input dataset
#'   cols2l = 1:4,                   # Melt the first 4 columns to long format
#'   by = "Species"                  # Group by Species column
#' )
#'
#' # Example 3: Nest specific columns with column names
#' w2l_nest(
#'   data = iris,                    # Input dataset
#'   cols2l = c("Sepal.Length",      # Select columns by name
#'              "Sepal.Width", 
#'              "Petal.Length"),
#'   by = 5                          # Group by column index 5 (Species)
#' )
#' # Returns similar structure to Example 2
w2l_nest <- function(data, cols2l = NULL, by = NULL, nest_type = "dt") {
  # Reshape `data` from wide to long (when `cols2l` is given) and collapse it
  # into one row per group, with each group's rows stored in the list-column
  # `data`.
  #
  # Arguments:
  #   data      data.frame or data.table in wide format.
  #   cols2l    Column indices or names to melt; NULL skips the melt, in which
  #             case `by` is required.
  #   by        Column indices or names to group on; NULL groups on `name` only.
  #   nest_type "dt" to nest data.tables, "df" to nest data.frames.
  #
  # Returns a data.table keyed by `name` (when melting) and/or the `by`
  # columns, plus the list-column `data`.
  #
  # All arguments are validated before any data.table work is done, so a bad
  # call fails fast with a descriptive message instead of failing inside melt
  # or the grouping step.

  # `.` only appears inside data.table's j expression; binding it locally
  # keeps R CMD check from reporting an undefined global.
  . <- NULL

  # A data.table inherits from data.frame, so this single test covers both.
  if (!is.data.frame(data)) {
    stop("Data must be either a data.frame or a data.table.", call. = FALSE)
  }

  # Turn a column specification (indices or names) into validated names.
  resolve_cols <- function(cols, arg, missing_prefix) {
    if (is.numeric(cols)) {
      # NA would make the bounds test below evaluate to NA and abort the `if`
      # with an unhelpful message, so reject it explicitly first.
      if (anyNA(cols)) {
        stop("Numeric indices in ", arg, " must not contain NA.",
             call. = FALSE)
      }
      if (any(cols < 1 | cols > ncol(data))) {
        stop("Numeric indices in ", arg, " are out of bounds.", call. = FALSE)
      }
      return(names(data)[cols])
    }
    if (is.character(cols)) {
      absent <- cols[!cols %in% names(data)]
      if (length(absent) > 0) {
        stop(missing_prefix, paste(absent, collapse = ", "), call. = FALSE)
      }
      return(cols)
    }
    stop(
      arg,
      " should be either numeric indices or character vector of column names.",
      call. = FALSE
    )
  }

  if (!is.null(by)) {
    by <- resolve_cols(by, "by", "Grouping variables not present in data: ")
  }

  if (is.null(cols2l)) {
    # Without columns to melt, the grouping columns are the only nest keys.
    if (is.null(by)) {
      stop("When cols2l is NULL, by parameter must be provided.",
           call. = FALSE)
    }
  } else {
    cols2l <- resolve_cols(
      cols2l, "cols2l",
      "Some columns specified in cols2l are not present in the data: "
    )
    # A melted column no longer exists afterwards, so it cannot also serve as
    # a grouping column.
    shared <- intersect(by, cols2l)
    if (length(shared) > 0) {
      stop("Columns cannot be used in both cols2l and by: ",
           paste(shared, collapse = ", "), call. = FALSE)
    }
  }

  # Checked before the melt so an invalid value costs nothing; the length test
  # also covers NULL and vector input.
  if (length(nest_type) != 1L || !(nest_type %in% c("dt", "df"))) {
    stop("Invalid nest_type provided. It must be either 'dt' or 'df'.",
         call. = FALSE)
  }

  # Convert plain data.frames only after validation has passed.
  if (!data.table::is.data.table(data)) {
    data <- data.table::as.data.table(data)
  }

  if (is.null(cols2l)) {
    long_data <- data
    nest_keys <- by
  } else {
    long_data <- data.table::melt(
      data,
      measure.vars = cols2l,
      variable.name = "name",
      value.name = "value"
    )
    # c() drops a NULL or zero-length `by`, leaving "name" as the only key.
    nest_keys <- c("name", by)
  }

  # One row per group; `data` holds the group's remaining columns.
  if (nest_type == "dt") {
    long_data[, .(data = list(.SD)), by = nest_keys]
  } else {
    long_data[, .(data = list(as.data.frame(.SD))), by = nest_keys]
  }
}