R/import_xlsx.R
In mintyr: Streamlined Data Processing Tools for Genomic Selection

Documented in import_xlsx

# WARNING - Generated by {fusen} from dev/flat_teaching.Rmd: do not edit by hand

#' Import Data from `XLSX` Files with Advanced Handling
#'
#' @description
#' A robust and flexible function for importing data from one or multiple 
#' `XLSX` files, offering comprehensive options for sheet selection, 
#' data combination, and source tracking.
#'
#' @param file A `character` vector of file paths to `Excel` files. 
#'   Must point to existing `.xlsx` or `.xls` files.
#' @param rbind A `logical` value controlling data combination strategy:
#'   - `TRUE`: Combines all data into a single `data.table`
#'   - `FALSE`: Returns a list of `data.table`s
#'   Default is `TRUE`.
#' @param sheet A `numeric` vector or `NULL` specifying sheet import strategy:
#'   - `NULL` (default): Imports all sheets
#'   - `numeric`: Imports only specified sheet indices
#' @param ... Additional arguments passed to [`readxl::read_excel()`], 
#'   such as `col_types`, `skip`, or `na`.
#'
#' @details
#' The function provides a comprehensive solution for importing Excel data with the
#' following features:
#' \itemize{
#'   \item Supports multiple files and sheets
#'   \item Automatic source tracking for files and sheets
#'   \item Flexible combining options
#'   \item Handles missing columns across sheets when combining
#'   \item Preserves original data types through readxl
#' }
#'
#' @return 
#' Depends on the `rbind` parameter:
#' \itemize{
#'   \item If `rbind = TRUE`: A single `data.table` with additional tracking columns:
#'     - `excel_name`: Source file name (without extension)
#'     - `sheet_name`: Source sheet name
#'   \item If `rbind = FALSE`: A named list of `data.table`s with format 
#'     `"filename_sheetname"`
#' }
#'
#' @note
#' Critical Import Considerations:
#' \itemize{
#'   \item Requires all specified files to be accessible `Excel` files
#'   \item Sheet indices must be valid across input files
#'   \item `rbind = TRUE` assumes compatible data structures
#'   \item Missing columns are automatically filled with `NA`
#'   \item File extensions are automatically removed in tracking columns
#' }
#'
#' @seealso
#' \itemize{
#'   \item [`readxl::read_excel()`] for underlying Excel reading
#'   \item [`data.table::rbindlist()`] for data combination
#' }
#'
#' @importFrom data.table ":="
#' @importFrom readxl read_excel excel_sheets
#'
#' @export
#' @examples
#' # Example: Excel file import demonstrations
#'
#' # Setup test files
#' xlsx_files <- mintyr_example(
#'   mintyr_examples("xlsx_test")    # Get example Excel files
#' )
#'
#' # Example 1: Import and combine all sheets from all files
#' import_xlsx(
#'   xlsx_files,                     # Input Excel file paths
#'   rbind = TRUE                    # Combine all sheets into one data.table
#' )
#'
#' # Example 2: Import specific sheets separately
#' import_xlsx(
#'   xlsx_files,                     # Input Excel file paths
#'   rbind = FALSE,                  # Keep sheets as separate data.tables
#'   sheet = 2                       # Only import first sheet
#' )
import_xlsx <- function(file, rbind = TRUE, sheet = NULL, ...) {
  excel_name <- NULL
  # Parameter checks
  if (!is.character(file) || !all(file.exists(file))) {
    stop("file must be a vector of existing file paths.")
  }

  if (!is.logical(rbind)) {
    stop("Parameter 'rbind' should be logical (TRUE or FALSE).")
  }

  # Function to remove file extension
  remove_extension <- function(filename) {
    sub("\\.[^.]*$", "", basename(filename))
  }

  # Reads selected sheets from a single Excel file and converts them into a data.table
  read_selected_sheets <- function(file_path, merge, sheet_indices, ...) {
    all_sheets <- readxl::excel_sheets(file_path)
    # Validate sheet indices
    if (!is.null(sheet_indices)) {
      if (is.numeric(sheet_indices)) {
        if (any(sheet_indices > length(all_sheets)) || any(sheet_indices < 1)) {
          stop("sheet index out of range for file: ", file_path)
        }
      } else {
        stop("sheet parameter must be a numeric vector or NULL.")
      }
    }

    selected_sheets <- if (is.null(sheet_indices)) all_sheets else all_sheets[sheet_indices]

    sheet_data <- lapply(selected_sheets, function(s) {
      dt <- data.table::as.data.table(readxl::read_excel(file_path, sheet = s, ...))
      if (!merge) {
        return(list(data = dt))  # Return each sheet as an independent list item if not merging
      } else {
        return(dt)
      }
    })

    if (merge) {
      names(sheet_data) <- selected_sheets
      data.table::rbindlist(sheet_data, use.names = TRUE, fill = TRUE, idcol = "sheet_name")
    } else {
      names(sheet_data) <- selected_sheets
      return(sheet_data)
    }
  }

  # Finding minimum sheet count across all Excel files
  min_sheet_count <- min(sapply(file, function(f) length(readxl::excel_sheets(f))))

  # Sheet parameter validation
  if (!is.null(sheet)) {
    if (is.numeric(sheet) && (max(sheet) > min_sheet_count || min(sheet) < 1)) {
      stop("sheet parameter contains indices out of range across files.")
    }
  }

  # Applies the modified function across all files
  all_data <- lapply(file, read_selected_sheets, merge = rbind, sheet_indices = sheet, ...)

  if (rbind) {
    # If merging, use rbindlist to combine all files' data into one data.table
    combined_data <- data.table::rbindlist(all_data, use.names = TRUE, fill = TRUE, idcol = "excel_name")
    xlsx_sheets_names <- sapply(file, remove_extension)  # 使用remove_extension替代tools::file_path_sans_ext
    # Set 'excel_name' column's value to the corresponding file names
    combined_data[, excel_name := rep(xlsx_sheets_names, sapply(all_data, nrow))][]
    return(combined_data)
  } else {
    # If not merging, create a new list to store all sheets' data
    result_list <- list()
    xlsx_sheets_names <- sapply(file, remove_extension)  # 使用remove_extension替代tools::file_path_sans_ext
    for (i in seq_along(file)) {
      file_name <- xlsx_sheets_names[i]
      file_data <- all_data[[i]]
      # For each file's sheets, set list item names as "file_name_sheet_name"
      for (sheet_name in names(file_data)) {
        list_name <- paste(file_name, sheet_name, sep = "_")
        result_list[[list_name]] <- file_data[[sheet_name]][["data"]]
      }
    }
    return(result_list)
  }
}