R/export_nest.R
In mintyr: Streamlined Data Processing Tools for Genomic Selection

Documented in export_nest

# WARNING - Generated by {fusen} from dev/flat_teaching.Rmd: do not edit by hand

#' Export Nested Data with Advanced Grouping and Flexible Handling
#'
#' @description
#' The `export_list` function exports nested data from a `data.frame` or `data.table` with sophisticated grouping 
#' capabilities, supporting multiple nested column types and flexible file export options.
#'
#' @param nest_dt A `data.frame` or `data.table` containing nested columns of `data.frame`s, 
#'   `data.table`s, or lists to be exported.
#' @param group_cols Optional character vector specifying grouping columns. 
#'   If `NULL`, uses all non-nested columns as grouping variables.
#' @param nest_col Optional character string indicating the nested column to export. 
#'   If `NULL`, automatically selects the first nested column.
#' @param export_path Base directory path for file export. Defaults to a temporary directory 
#'   created by `tempdir()`.
#' @param file_type File export format, either `"txt"` (tab-separated) or `"csv"`. 
#'   Defaults to `"txt"`.
#'
#' @details
#' Comprehensive Nested Data Export Features:
#' \itemize{
#'   \item Automatic detection and handling of different nested column types
#'   \item Flexible grouping strategies with intelligent column selection
#'   \item Hierarchical directory structure generation based on grouping columns
#'   \item Support for mixed nested column types (`data.frame`, `data.table`, `list`)
#'   \item Multi-threaded file writing for enhanced performance
#'   \item Informative messaging and warning system
#' }
#'
#' Nested Column Detection Hierarchy:
#' \enumerate{
#'   \item Prioritizes `data.frame`/`data.table` nested columns
#'   \item Falls back to regular `list` columns if no `data.frame` columns exist
#' }
#'
#' Grouping Column Selection Strategy:
#' \enumerate{
#'   \item When `group_cols` is `NULL`, uses all non-nested columns
#'   \item Provides warnings about unused non-nested columns
#'   \item Validates provided group columns
#' }
#'
#' File Export Characteristics:
#' \itemize{
#'   \item Supports `"txt"` (tab-separated) and `"csv"` formats
#'   \item Uses multi-threading via `parallel::detectCores()`
#'   \item Creates nested directory structure based on grouping variables
#' }
#'
#' @return 
#' An `integer` representing the total number of files exported successfully.
#'
#' @note
#' Key Capabilities:
#' \itemize{
#'   \item Handles complex nested data structures
#'   \item Performs type conversion for nested content
#'   \item Utilizes multi-threaded file export for optimal performance
#'   \item Provides comprehensive column selection feedback
#' }
#'
#' @importFrom data.table as.data.table fwrite
#' @importFrom parallel detectCores
#' @export
#' @examples
#' # Example 1: Basic nested data export workflow
#' # Step 1: Create nested data structure
#' dt_nest <- w2l_nest(
#'   data = iris,              # Input iris dataset
#'   cols2l = 1:2,             # Columns to be nested
#'   by = "Species"            # Grouping variable
#' )
#'
#' # Step 2: Export nested data to files
#' export_nest(
#'   nest_dt = dt_nest,        # Input nested data.table
#'   nest_col = "data",        # Column containing nested data
#'   group_cols = c("name", "Species")  # Columns to create directory structure
#' )
#' # Returns the number of files created
#' # Creates directory structure: tempdir()/name/Species/data.txt
#'
#' # Check exported files
#' list.files(
#'   path = tempdir(),         # Default export directory
#'   pattern = "txt",          # File type pattern to search
#'   recursive = TRUE          # Search in subdirectories
#' )
#' # Returns list of created files and their paths
#'
#' # Clean up exported files
#' files <- list.files(
#'   path = tempdir(),         # Default export directory
#'   pattern = "txt",          # File type pattern to search
#'   recursive = TRUE,         # Search in subdirectories
#'   full.names = TRUE         # Return full file paths
#' )
#' file.remove(files)          # Remove all exported files
export_nest <- function(nest_dt, group_cols = NULL, nest_col = NULL,
                        export_path = tempdir(), file_type = "txt") {
  # Basic input validation
  if (nrow(nest_dt) == 0) {
    stop("The input nest_dt cannot be empty")
  }

  # Check and get nested columns
  # 1. Check for data.frame/data.table nested columns
  df_nested_cols <- names(nest_dt)[sapply(nest_dt, function(x) {
    is.list(x) && all(sapply(x, function(y) {
      inherits(y, c("data.frame", "data.table"))
    }))
  })]

  # 2. Check for regular list columns
  list_cols <- names(nest_dt)[vapply(nest_dt, is.list, logical(1))]

  # Combine both types of nested columns
  nested_cols <- unique(c(df_nested_cols, list_cols))

  if (length(nested_cols) == 0) {
    stop("The input nest_dt must contain at least one nested column")
  }

  # If nest_col is NULL, prioritize using data.frame/data.table nested columns
  if (is.null(nest_col)) {
    if (length(df_nested_cols) > 0) {
      nest_col <- df_nested_cols[1]
      message("Using first nested data.frame/data.table column: ", nest_col)
    } else {
      nest_col <- list_cols[1]
      message("Using first list column: ", nest_col)
    }
  } else if (!nest_col %in% nested_cols) {
    stop("Specified nest_col is not a valid nested column")
  }

  # If group_cols is NULL, use all non-nested columns
  if (is.null(group_cols)) {
    group_cols <- setdiff(names(nest_dt), nested_cols)
    message("Using all non-nested columns as groups: ", paste(group_cols, collapse = ", "))
  } else {
    # Validate user-provided group columns
    if (!is.character(group_cols)) {
      stop("group_cols must be a character vector")
    }
    missing_cols <- setdiff(group_cols, names(nest_dt))
    if (length(missing_cols) > 0) {
      stop("The following group columns are missing: ", paste(missing_cols, collapse = ", "))
    }

    # Check if all non-nested columns are used as group columns
    all_non_nested_cols <- setdiff(names(nest_dt), nested_cols)
    unused_cols <- setdiff(all_non_nested_cols, group_cols)
    if (length(unused_cols) > 0) {
      warning("Not all non-nested columns are used as group columns. ",
              "The exported data may be incomplete without the following columns: ",
              paste(unused_cols, collapse = ", "))
    }
  }

  # Parameter validation
  file_type <- tolower(file_type)
  if (!(file_type %in% c("txt", "csv"))) {
    stop("file_type must be either 'txt' or 'csv'")
  }

  if (!is.character(export_path) || length(export_path) != 1) {
    stop("export_path must be a single character string")
  }

  # Create export directory
  dir.create(export_path, showWarnings = FALSE, recursive = TRUE)

  # Export processing
  tryCatch({
    # Process and expand nested data
    expanded_dt <- nest_dt[, {
      processed_nests <- lapply(get(nest_col), function(x) {
        if (inherits(x, c("data.frame", "data.table"))) {
          x_dt <- if (!is.data.table(x)) as.data.table(x) else x
          setattr(x_dt, "row.names", .set_row_names(nrow(x_dt)))
          return(x_dt)
        } else {
          # Try to convert non-data.frame/data.table to data.table
          tryCatch({
            x_dt <- as.data.table(as.list(x))
            setattr(x_dt, "row.names", .set_row_names(nrow(x_dt)))
            return(x_dt)
          }, error = function(e) {
            stop(sprintf("Cannot convert nested content to data.table: %s", e$message))
          })
        }
      })
      data.table::rbindlist(processed_nests, fill = TRUE)
    }, by = group_cols]

    expanded_dt <- copy(expanded_dt)

    # Create required subdirectories
    unique_paths <- unique(expanded_dt[, do.call(file.path,
                                                 c(list(export_path), lapply(group_cols, function(col) get(col))))])
    lapply(unique_paths, dir.create, showWarnings = FALSE, recursive = TRUE)

    # Export files
    file_count <- 0L
    expanded_dt[, {
      dir_path <- do.call(file.path, c(list(export_path),
                                       lapply(group_cols, function(col) get(col))))
      sep <- if (file_type == "txt") "\t" else ","
      filename <- paste0(nest_col, ".", file_type)
      fwrite(.SD, file = file.path(dir_path, filename),
             sep = sep, nThread = parallel::detectCores() - 1, buffMB = 32)
      file_count <<- file_count + 1L
      NULL
    }, by = group_cols]

    return(file_count)
  }, error = function(e) {
    stop("Failed to export nested data: ", e$message)
  })
}