R/read.R

Defines functions wb_data wb_read read_xlsx wb_to_df convert_df

Documented in read_xlsx wb_data wb_read wb_to_df

# Internal function to convert data frame from character to whatever is required
convert_df <- function(z, types, date_conv, datetime_conv, hms_conv, as_character = FALSE, col_names = FALSE) {

  type_vals <- types[!is.na(names(types))]
  if (length(type_vals) == 0) {
    warning("could not convert. All missing in row used for variable names")
    return(z)
  }

  if (col_names) {
    # avoid scientific notation in column names
    op <- default_save_opt()
    on.exit(options(op), add = TRUE)
  }

  nums <- which(type_vals == 1)
  dtes <- which(type_vals == 2)
  poxs <- which(type_vals == 3)
  logs <- which(type_vals == 4)
  difs <- which(type_vals == 5)
  fmls <- which(type_vals == 6)

  if (as_character) {
    if (length(nums)) z[nums] <- lapply(z[nums], function(i) as.character(convert_num(i)))
    if (length(dtes)) z[dtes] <- lapply(z[dtes], function(i) as.character(date_conv(i)))
    if (length(poxs)) z[poxs] <- lapply(z[poxs], function(i) as.character(datetime_conv(i)))
    if (length(logs)) z[logs] <- lapply(z[logs], function(i) as.character(as.logical(i)))
    if (length(difs)) z[difs] <- lapply(z[difs], function(i) as.character(hms_conv(i)))
  } else {
    if (length(nums)) z[nums] <- lapply(z[nums], convert_num)
    if (length(dtes)) z[dtes] <- lapply(z[dtes], date_conv)
    if (length(poxs)) z[poxs] <- lapply(z[poxs], datetime_conv)
    if (length(logs)) z[logs] <- lapply(z[logs], as.logical)
    if (length(difs)) z[difs] <- lapply(z[difs], hms_conv)
  }

  if (length(fmls)) {
    for (i in fmls) {
      class(z[[i]]) <- c(class(z[[i]]), "formula")
    }
  }

  z
}

# `wb_to_df()` ----------------------------------------
#' Create a data frame from a Workbook
#'
#' @description
#' The `wb_to_df()` function is the primary interface for extracting data from
#' spreadsheet files into R. It interprets the underlying XML structure of a
#' worksheet to reconstruct a data frame, handling cell types, dimensions, and
#' formatting according to user specification. While `read_xlsx()` and
#' `wb_read()` are available as streamlined internal wrappers for users
#' accustomed to other spreadsheet packages, wb_to_df() serves as the
#' foundational function and provides the most comprehensive access to the
#' package's data extraction and configuration parameters.
#'
#' @details
#' The function extracts data based on a defined range or the total data extent
#' of a worksheet. If `col_names = TRUE`, the first row of the selection is
#' treated as the header; otherwise, spreadsheet column letters are used. If
#' `row_names = TRUE`, the first column of the selected range is assigned to
#' the data frame's row names.
#'
#' Dimension selection is highly flexible. The `dims` argument supports standard
#' "A1:B2" notation as well as dynamic wildcards for rows and columns. Using
#' `++` or `--` allows ranges to adapt to the spreadsheet's content. For
#' instance, `dims = "A2:C+"` reads from A2 to the last available row in
#' column C, while `dims = "A-:+9"` reads from the first populated row in
#' column A to the last column in row 9. If neither `dims` nor `named_region`
#' is provided, the function automatically calculates the range based on the
#' minimum and maximum populated cells, modified by `start_row` and `start_col`.
#'
#' Type conversion is governed by an internal guessing engine. If `detect_dates`
#' is enabled, serial dates are converted to R Date or POSIXct objects. All
#' datetimes are standardized to UTC. The function's handling of time variables
#' depends on the presence of the `hms` package; if loaded, `wb_to_df()` returns
#' `hms` variables. Otherwise, they are returned as string variables in
#' `hh:mm:ss` format. Users can provide explicit column types via the `types`
#' argument using numeric codes: 0 (character), 1 (numeric), 2 (Date), 3 (POSIXct),
#' 4 (logical), 5 (hms), and 6 (formula).
#'
#' Regarding formulas, it is important to note that `wb_to_df()` will not
#' automatically evaluate formulas added to a workbook object via
#' [wb_add_formula()]. In the underlying spreadsheet XML, only the formula
#' expression is written; the resulting value is typically generated by the
#' spreadsheet software's calculation engine when the file is opened and saved.
#' Consequently, reading a newly added formula cell without prior evaluation in
#' external software will result in an empty value unless `show_formula = TRUE`
#' is used to retrieve the formula string itself.
#'
#' If `keep_attributes` is TRUE, the data frame is returned with additional
#' metadata. This includes the internal type-guessing table (`tt`), which
#' identifies the derived type for every cell in the range, and the specific
#' `types` vector used for conversion. These attributes are useful for
#' debugging or for applications requiring precise knowledge of the
#' spreadsheet's original cell metadata.
#'
#' Specialized spreadsheet features include the ability to extract hyperlink
#' targets (`show_hyperlinks = TRUE`) instead of display text. For complex
#' layouts, `fill_merged_cells` propagates the value of a top-left merged cell
#' to all cells within the merge range. The `na` argument supports sophisticated
#' missing value definitions, accepting either a character vector or a named
#' list to differentiate between string and numeric `NA` types.
#'
#' @section Notes:
#' Recent versions of `openxlsx2` have introduced several changes to the
#' `wb_to_df()` API:
#' * Legacy arguments such as `na.strings` and `na.numbers` are no longer part
#'     of the public API and have been consolidated into the `na` argument.
#' * As of version 1.15, all datetime variables are imported with the
#'     timezone set to "UTC" to prevent system-specific local timezone shifts.
#' * The function now supports reverse-order or specific-order imports when
#'     a numeric vector is passed to the `rows` argument.
#'
#' For extensive real-world examples and advanced usage patterns, consult
#' the package vignettes—specifically "openxlsx2 read to data frame"—and
#' the dedicated chapter in the `openxlsx2` book for real-life case studies.
#'
#' @param file A workbook file path, a [wbWorkbook] object, or a valid URL.
#' @param sheet The name or index of the worksheet to read. Defaults to the first sheet.
#' @param start_row,start_col Optional numeric values specifying the first row or column
#'   to begin data discovery.
#' @param row_names Logical; if TRUE, uses the first column of the selection as row names.
#' @param col_names Logical; if TRUE, uses the first row of the selection as column headers.
#' @param skip_empty_rows,skip_empty_cols Logical; if TRUE, filters out rows or
#'   columns containing only missing values.
#' @param skip_hidden_rows,skip_hidden_cols Logical; if TRUE, excludes rows or
#'   columns marked as hidden in the worksheet metadata.
#' @param rows,cols Optional numeric vectors specifying the exact indices to read.
#' @param detect_dates Logical; if TRUE, identifies date and datetime styles for conversion.
#' @param na A character vector or a named list (e.g., `list(strings = "", numbers = -99)`)
#'   defining values to treat as `NA`.
#' @param fill_merged_cells Logical; if TRUE, propagates the top-left value of a
#'   merged range to all cells in that range.
#' @param dims A character string defining the range. Supports wildcards
#'   (e.g., "A1:++" or "A-:+5").
#' @param named_region A character string referring to a defined name or spreadsheet Table.
#' @param show_formula Logical; if TRUE, returns the formula strings instead of
#'   calculated values.
#' @param convert Logical; if TRUE, attempts to coerce columns to appropriate R classes.
#' @param types A named vector (numeric or character) to explicitly define column types.
#' @param keep_attributes Logical; if TRUE, attaches metadata such as the internal
#'   type table (tt) and types as attributes to the output.
#' @param check_names Logical; if TRUE, ensures column names are syntactically
#'   valid R names via [make.names()].
#' @param show_hyperlinks Logical; if TRUE, replaces cell values with their
#'   underlying hyperlink targets.
#' @param apply_numfmts Logical; if TRUE, applies spreadsheet number formatting
#'   and returns strings.
#' @param ... Additional arguments passed to internal methods.
#'
#' @examples
#' ###########################################################################
#' # numerics, dates, missings, bool and string
#' example_file <- system.file("extdata", "openxlsx2_example.xlsx", package = "openxlsx2")
#' wb1 <- wb_load(example_file)
#'
#' # import workbook
#' wb_to_df(wb1)
#'
#' # do not convert first row to column names
#' wb_to_df(wb1, col_names = FALSE)
#'
#' # do not try to identify dates in the data
#' wb_to_df(wb1, detect_dates = FALSE)
#'
#' # return the underlying spreadsheet formula instead of their values
#' wb_to_df(wb1, show_formula = TRUE)
#'
#' # read dimension without colNames
#' wb_to_df(wb1, dims = "A2:C5", col_names = FALSE)
#'
#' # read selected cols
#' wb_to_df(wb1, cols = c("A:B", "G"))
#'
#' # read selected rows
#' wb_to_df(wb1, rows = c(2, 4, 6))
#'
#' # convert characters to numerics and date (logical too?)
#' wb_to_df(wb1, convert = FALSE)
#'
#' # erase empty rows from dataset
#' wb_to_df(wb1, skip_empty_rows = TRUE)
#'
#' # erase empty columns from dataset
#' wb_to_df(wb1, skip_empty_cols = TRUE)
#'
#' # convert first row to rownames
#' wb_to_df(wb1, sheet = 2, dims = "C6:G9", row_names = TRUE)
#'
#' # define type of the data.frame
#' wb_to_df(wb1, cols = c(2, 5), types = c("Var1" = 0, "Var3" = 1))
#'
#' # start in row 5
#' wb_to_df(wb1, start_row = 5, col_names = FALSE)
#'
#' # na string
#' wb_to_df(wb1, na = "a")
#'
#' # read names from row two and data starting from row 4
#' wb_to_df(wb1, dims = "B2:C2,B4:C+")
#'
#' ###########################################################################
#' # Named regions
#' file_named_region <- system.file("extdata", "namedRegions3.xlsx", package = "openxlsx2")
#' wb2 <- wb_load(file_named_region)
#'
#' # read dataset with named_region (returns global first)
#' wb_to_df(wb2, named_region = "MyRange", col_names = FALSE)
#'
#' # read named_region from sheet
#' wb_to_df(wb2, named_region = "MyRange", sheet = 4, col_names = FALSE)
#'
#' # read_xlsx() and wb_read()
#' example_file <- system.file("extdata", "openxlsx2_example.xlsx", package = "openxlsx2")
#' read_xlsx(file = example_file)
#' df1 <- wb_read(file = example_file, sheet = 1)
#' df2 <- wb_read(file = example_file, sheet = 1, rows = c(1, 3, 5), cols = 1:3)
#' @export
wb_to_df <- function(
    file,
    sheet,
    start_row         = NULL,
    start_col         = NULL,
    row_names         = FALSE,
    col_names         = TRUE,
    skip_empty_rows   = FALSE,
    skip_empty_cols   = FALSE,
    skip_hidden_rows  = FALSE,
    skip_hidden_cols  = FALSE,
    rows              = NULL,
    cols              = NULL,
    detect_dates      = TRUE,
    na                = "#N/A",
    fill_merged_cells = FALSE,
    dims,
    show_formula      = FALSE,
    convert           = TRUE,
    types,
    named_region,
    keep_attributes   = FALSE,
    check_names       = FALSE,
    show_hyperlinks   = FALSE,
    apply_numfmts     = FALSE,
    ...
) {

  arguments <- c(ls(), "na.strings", "na.numbers", "xlsx_file")
  standardize_case_names(..., arguments = arguments)

  na_strings <- NULL
  na_numbers <- NA
  if (is.character(na)) {
    na_strings <- na
  }
  if (is.list(na)) {
    na_strings <- na$strings
    na_numbers <- na$numbers %||% NA
  }

  if (apply_numfmts) convert <- FALSE

  args <- list(...)
  if (any(c("na.strings", "na.numbers") %in% names(args))) {
    # # Its a little premature to activate this
    # if (getOption("openxlsx2.soon_deprecated", default = FALSE)) {
    #   msg <- paste0(
    #     "na.strings and na.numbers should be combined in a named list,",
    #     "`wb_to_df(na = list(strings = '#N/A', numbers = 999)`"
    #   )
    #   warning(msg, call. = FALSE)
    # }

    if ("na.strings" %in% names(args)) na_strings <- args[["na.strings"]]
    if ("na.numbers" %in% names(args)) na_numbers <- args[["na.numbers"]]
  }

  xlsx_file <- args$xlsx_file
  if (!is.null(xlsx_file)) {
    .Deprecated(old = "xlsx_file", new = "file", package = "openxlsx2")
    file <- xlsx_file %||% file
  }

  if (!is.null(cols)) cols <- col2int(cols)

  if (inherits(file, "wbWorkbook")) {
    wb <- file
  } else {
    # passes missing further on
    if (missing(sheet))
      sheet <- substitute()

    data_only <- TRUE
    # TODO tables and hyperlinks are deeper embedded into the wb_load code
    if (!missing(named_region) || show_hyperlinks) data_only <- FALSE

    # possible false positive on current lintr runs
    wb <- wb_load(file, sheet = sheet, data_only = data_only) # nolint
  }

  if (!missing(named_region)) {

    nr <- wb$get_named_regions(tables = TRUE)

    if ((named_region %in% nr$name) && missing(sheet)) {
      sel   <- nr[nr$name == named_region, ][1, ]
      sheet <- sel$sheet
      dims  <- sel$coords
    } else if (named_region %in% nr$name) {
      sel <- nr[nr$name == named_region & nr$sheet == wb_validate_sheet(wb, sheet), ]
      if (NROW(sel) == 0) {
        stop("no such named_region on selected sheet")
      }
      dims <- sel$coords
    } else {
      stop("no such named_region")
    }
  }

  if (missing(sheet)) {
    # TODO default sheet as 1
    sheet <- 1
  }

  if (is.factor(sheet)) {
    sheet <- as.character(sheet)
  }

  if (is.character(sheet)) {
    sheet <- wb_validate_sheet(wb, sheet)
  }

  if (is.na(sheet)) {
    stop("sheet not found. available sheets are: \n", paste0(wb$get_sheet_names(), collapse = ", "))
  }

  ws <- wb$worksheets[[sheet]]

  # the sheet has no data
  if (is.null(ws$sheet_data$cc) ||
      nrow(ws$sheet_data$cc) == 0) {
    # TODO do we need more checks or do we need to initialize a new cc object?
    message("sheet found, but contains no data")
    return(NULL)
  }

  # # Should be available, but is optional according to openxml-2.8.1. Still some
  # # third party applications are known to require it. Maybe make using
  # # dimensions an optional parameter?
  # if (missing(dims))
  #   dims <- getXML1attr_one(ws$dimension,
  #                           "dimension",
  #                           "ref")

  # If no dims are requested via named_region, simply construct them from min
  # and max columns and row found on worksheet
  # TODO it would be useful to have both named_region and dims?
  has_dims <- TRUE
  if (missing(named_region) && missing(dims)) {
    has_dims <- FALSE

    sd <- ws$sheet_data$cc[c("row_r", "c_r")]
    row <- range(as.integer(unique(sd$row_r)))
    col <- range(col2int(unique(sd$c_r)))

    if (!is.null(start_row) && as.integer(start_row) < row[1])
      row[1] <- start_row

    if (!is.null(start_col) && col2int(start_col) < col[1])
      col[1] <- start_col

    if (row[1] > row[2]) row[2] <- row[1]
    if (col[1] > col[2]) col[2] <- col[1]

    dims <- paste0(int2col(col[1]), row[1], ":",
                   int2col(col[2]), row[2])

  }

  row_attr  <- ws$sheet_data$row_attr
  cc  <- ws$sheet_data$cc
  sst <- wb$sharedStrings

  rnams <- row_attr$r
  rnams <- rnams[rnams %in% unique(cc$row_r)] # avoid blank row interference

  numfmt_date <- numfmt_is_date(wb$styles_mgr$styles$numFmts)
  xlsx_date_style <- style_is_date(wb$styles_mgr$styles$cellXfs, numfmt_date)

  # exclude if year, month or day are suspected
  numfmt_hms <- numfmt_is_hms(wb$styles_mgr$styles$numFmts)
  xlsx_hms_style <- style_is_hms(wb$styles_mgr$styles$cellXfs, numfmt_hms)

  numfmt_posix <- numfmt_is_posix(wb$styles_mgr$styles$numFmts)
  xlsx_posix_style <- style_is_posix(wb$styles_mgr$styles$cellXfs, numfmt_posix)

  # create temporary data frame. hard copy required
  z  <- dims_to_dataframe(dims, empty_rm = TRUE, cc = cc)
  tt <- create_int_dataframe(z)

  keep_cols <- colnames(z)
  keep_rows <- rownames(z)

  maxRow <- max(as.numeric(keep_rows))
  maxCol <- max(col2int(keep_cols))

  if (!is.null(start_row)) {
    keep_rows <- as.character(seq(start_row, maxRow))
    if (start_row <= maxRow) {
      sel <- rownames(z) %in% keep_rows
      z  <- z[sel, , drop = FALSE]
      tt <- tt[sel, , drop = FALSE]
    } else {
      keep_rows <- as.character(start_row)
      z  <- z[keep_rows, , drop = FALSE]
      tt <- tt[keep_rows, , drop = FALSE]

      rownames(z)  <- as.integer(keep_rows)
      rownames(tt) <- as.integer(keep_rows)
    }
  }

  if (!is.null(rows)) {
    keep_rows <- as.character(as.integer(rows))

    if (!anyNA(sel <- match(keep_rows, rownames(z)))) {
      z  <- z[sel, , drop = FALSE]
      tt <- tt[sel, , drop = FALSE]
    } else {
      z  <- z[keep_rows, , drop = FALSE]
      tt <- tt[keep_rows, , drop = FALSE]

      ints <- as.integer(keep_rows)
      rownames(z)  <- ints
      rownames(tt) <- ints
    }
  }

  if (!is.null(start_col)) {
    keep_cols <- int2col(seq(col2int(start_col), maxCol))

    if (!all(sel <- keep_cols %in% colnames(z))) {
      keep_col <- keep_cols[!sel]

      z[keep_col]  <- NA_character_
      tt[keep_col] <- NA_integer_

      z  <- z[keep_cols]
      tt <- tt[keep_cols]
    }

    sel <- match(keep_cols, colnames(z))
    z  <- z[, sel, drop = FALSE]
    tt <- tt[, sel, drop = FALSE]
  }

  if (!is.null(cols)) {
    keep_cols <- int2col(cols)

    if (!all(keep_cols %in% colnames(z))) {
      keep_col <- keep_cols[!keep_cols %in% colnames(z)]

      z[keep_col] <- NA_character_
      tt[keep_col] <- NA_integer_
    }

    sel <- match(keep_cols, colnames(z))
    z  <- z[, sel, drop = FALSE]
    tt <- tt[, sel, drop = FALSE]
  }

  keep_rows <- intersect(keep_rows, rnams)

  # reduce data to selected cases only
  if (has_dims && length(keep_rows) && length(keep_cols))
    cc <- cc[cc$row_r %in% keep_rows & cc$c_r %in% keep_cols, ]

  cc$val <- rep_len(NA_character_, nrow(cc))
  cc$typ <- rep_len(NA_integer_, nrow(cc))

  cc_tab <- unique(cc$c_t)

  # bool
  if (any(cc_tab == "b")) {
    sel <- cc$c_t %in% "b"
    cc$val[sel] <- as.logical(as.numeric(cc$v[sel]))
    cc$typ[sel] <- 4L
  }
  # text in v
  if (any(cc_tab %in% c("str", "e"))) {
    sel <- cc$c_t %in% c("str", "e")
    cc$val[sel] <- replaceXMLEntities(cc$v[sel])
    cc$typ[sel] <- 0L
  }
  # text in t
  if (any(cc_tab == "inlineStr")) {
    sel <- cc$c_t %in% c("inlineStr")
    cc$val[sel] <- is_to_txt(cc$is[sel])
    cc$typ[sel] <- 0L
  }
  # test is sst
  if (any(cc_tab == "s")) {
    sel <- cc$c_t %in% c("s")
    cc$val[sel] <- si_to_txt(sst[as.numeric(cc$v[sel]) + 1])
    cc$typ[sel] <- 0L
  }

  has_na_string <- FALSE
  # convert missings
  if (!all(is.na(na_strings))) {
    sel <- cc$val %in% na_strings
    if (any(sel)) {
      cc$val[sel] <- NA_character_
      cc$typ[sel] <- -1L
      has_na_string <- TRUE
    }
  }

  has_na_number <- FALSE
  # convert missings.
  # at this stage we only have characters.
  na_numbers <- as.character(na_numbers)
  if (!all(is.na(na_numbers))) {
    sel <- cc$v %in% na_numbers
    if (any(sel)) {
      cc$val[sel] <- NA_character_
      cc$typ[sel] <- -2L
      has_na_number <- TRUE
    }
  }

  origin <- get_date_origin(wb)

  # dates
  if (NROW(cc) && !is.null(cc$c_s)) {

    # if a cell is t="s" the content is a sst and not da date

    all_styles <- c(xlsx_date_style, xlsx_hms_style, xlsx_posix_style)
    if (detect_dates && missing(types) && length(all_styles)) {

      uccs <- unique(cc$c_s)

      if (any(uccs %in% all_styles)) {

        strings <- c("s", "str", "b", "inlineStr")
        is_string <- !is.null(cc$c_t) & (cc$c_t %in% strings)

        is_valid_val <- !is_string & cc$v != "" & (cc$c_t != "e" | is.na(cc$c_t))

        if (any(uccs %in% xlsx_date_style)) {
          sel <- is_valid_val & (cc$c_s %in% xlsx_date_style)
          if (any(sel)) { # Only run if there are actual matches
            if (convert)
              cc$val[sel] <- date_to_unix(cc$v[sel], origin = origin)
            else
              cc$val[sel] <- as.character(convert_date(cc$v[sel], origin = origin))
            cc$typ[sel]  <- 2L
          }
        }

        if (any(uccs %in% xlsx_hms_style)) {
          sel <- is_valid_val & (cc$c_s %in% xlsx_hms_style)
          if (any(sel)) {
            if (convert) {
              cc$val[sel] <- cc$v[sel]
            } else {
              cc$val[sel] <- as.character(convert_hms(cc$v[sel]))
            }
            cc$typ[sel]  <- 5L
          }
        }

        if (any(uccs %in% xlsx_posix_style)) {
          sel <- is_valid_val & (cc$c_s %in% xlsx_posix_style)
          if (any(sel)) {
            if (convert)
              cc$val[sel] <- date_to_unix(cc$v[sel], origin = origin, datetime = TRUE)
            else
              cc$val[sel] <- as.character(convert_datetime(cc$v[sel], origin = origin))
            cc$typ[sel]  <- 3L
          }
        }
      }
    }
  }

  # remaining values are numeric?
  if (any(cc_tab %in% c("n", ""))) {
    sel <- which(is.na(cc$typ))
    cc$val[sel] <- cc$v[sel]
    cc$typ[sel] <- 1L
  }

  if (show_formula) {

    if (any(grepl("shared", cc$f_attr))) {

      # depending on the sheet, this might require updates to many cells
      # TODO reduce this to cells, that are part of `cc`. Currently we
      # might waste time, updating cells that are not visible to the user
      cc_shared <- ws$sheet_data$cc
      cc_shared$shared_fml <- rbindlist(xml_attr(paste0("<f ", cc_shared$f_attr, "/>"), "f"))$t
      cc_shared <- cc_shared[cc_shared$shared_fml == "shared", ]

      cc <- shared_as_fml(cc, cc_shared)
    }

    sel <- cc$f != ""
    cc$val[sel] <- replaceXMLEntities(cc$f[sel])
    cc$typ[sel] <- 6L

  }

  if (show_hyperlinks) {

    if (length(ws$hyperlinks)) {

      hls <- wb_to_hyperlink(wb, sheet)
      hyprlnks <- as.data.frame(
        do.call("rbind",
                lapply(hls, function(hl) {
                  c(hl$ref, ifelse(is.null(hl$target), hl$location, hl$target))
                })
        ),
        stringsAsFactors = FALSE
      )
      cc$val[match(hyprlnks$V1, cc$r)] <- hyprlnks$V2
    }

  }

  # convert "na_string" to missing
  if (has_na_string) cc$typ[cc$typ == -1] <- NA_integer_
  if (has_na_number) cc$typ[cc$typ == -2] <- NA_integer_

  if (apply_numfmts) {

    cc <- get_numfmt_style(wb, cc)
    kc <- if (!is.null(cols)) cc$c_r %in% col2int(cols) else TRUE

    not_blank_or_bool_error <- cc$num_fmt != "" & !cc$c_t %in% c("b", "e") & kc

    # apply_numfmt expects numeric, character or date/posixct
    sel <- not_blank_or_bool_error & cc$typ %in% c(1L, 4L)
    if (any(sel)) {
      cc$val[sel] <- apply_numfmt(as.numeric(cc$val[sel]), cc$num_fmt[sel])
      cc$typ[sel] <- 0L
    }
    sel <- not_blank_or_bool_error & cc$typ %in% c(0L, 2L, 3L, 5L)
    if (any(sel)) {
      cc$val[sel] <- apply_numfmt(cc$val[sel], cc$num_fmt[sel])
      cc$typ[sel] <- 0L
    }
  }

  # prepare to create output object z
  # we need to create the correct col and row position as integer starting at 0. Because we allow
  # to select specific rows and columns, we must make sure that our zz cols and rows matches the
  # z data frame.
  zz <- data.frame(
    val  = cc$val,
    typ  = cc$typ,
    cols = match(cc$c_r, colnames(z)) - 1L,
    rows = match(cc$row_r, rownames(z)) - 1L,
    stringsAsFactors = FALSE
  )

  # zz <- zz[order(zz[, "cols"], zz[, "rows"]), ]
  if (any(zz$val == "", na.rm = TRUE)) zz <- zz[zz$val != "", ]
  long_to_wide(z, tt, zz)

  # backward compatible option. get the mergedCells dimension and fill it with
  # the value of the first cell in the range. do the same for tt.
  if (fill_merged_cells) {
    mc <- ws$mergeCells
    if (length(mc)) {

      mc <- unlist(xml_attr(mc, "mergeCell"))

      for (i in seq_along(mc)) {
        filler <- stringi::stri_split_fixed(mc[i], pattern = ":")[[1]][1]


        dms <- dims_to_dataframe(mc[i])

        if (any(row_sel <- rownames(z) %in% rownames(dms)) &&
            any(col_sel <- colnames(z) %in% colnames(dms))) {

          # TODO there probably is a better way in not reducing cc above, so
          # that we do not have to go through large xlsx files multiple times
          z_fill <- wb_to_df(
            file            = wb,
            sheet           = sheet,
            dims            = filler,
            na              = list(
              strings = na_strings,
              numbers = na_numbers
            ),
            convert         = FALSE,
            col_names       = FALSE,
            detect_dates    = detect_dates,
            show_formula    = show_formula,
            keep_attributes = TRUE
          )

          tt_fill <- attr(z_fill, "tt")

          z[row_sel,  col_sel] <- z_fill
          tt[row_sel, col_sel] <- tt_fill
        }
      }
    }

  }

  # the following two skip hidden columns and row and need a valid keep_rows and
  # keep_cols length.
  if (skip_hidden_rows) {
    sel <- row_attr$hidden == "1" | row_attr$hidden == "true"
    if (any(sel)) {
      hide   <- !keep_rows %in% row_attr$r[sel]

      z  <- z[hide, , drop = FALSE]
      tt <- tt[hide, , drop = FALSE]
    }
  }

  if (skip_hidden_cols) {
    col_attr <- ws$unfold_cols()
    sel <- col_attr$hidden == "1" | col_attr$hidden == "true"
    if (any(sel)) {
      hide     <- col2int(keep_cols) %in% as.integer(col_attr$min[sel])

      z[hide]  <- NULL
      tt[hide] <- NULL
    }
  }

  # is.na needs convert
  if (skip_empty_rows) {
    empty <- vapply(seq_len(nrow(z)), function(x) all(is.na(z[x, ])), NA)

    z  <- z[!empty, , drop = FALSE]
    tt <- tt[!empty, , drop = FALSE]
  }

  if (skip_empty_cols) {

    empty <- vapply(z, function(x) all(is.na(x)), NA)

    if (any(empty)) {
      sel <- which(empty)
      z[sel]  <- NULL
      tt[sel] <- NULL
    }

  }

  # prepare colnames object
  xlsx_cols_names <- colnames(z)
  names(xlsx_cols_names) <- xlsx_cols_names


  date_conv     <- function(x) as.Date(.POSIXct(as.double(x), "UTC"), tz = "UTC", origin = "1970-01-01")
  datetime_conv <- function(x) .POSIXct(as.double(x), "UTC")
  hms_conv      <- convert_hms

  # if colNames, then change tt too. rownames will be converted later. If column name row
  # is in z/tt, the column name guessing will fail below
  if (col_names) {
    # select first row as colnames, but do not yet assign. it might contain
    # missing values and if assigned, convert below might break with unambiguous
    # names.

    z_head  <- df_1(z)
    tt_head <- df_1(tt)

    nams <- names(xlsx_cols_names)
    if (convert && ncol(z))
      xlsx_cols_names <- convert_df(z_head, guess_col_type(tt_head), date_conv, datetime_conv, hms_conv, as_character = TRUE, col_names = TRUE)
    else
      xlsx_cols_names <- z_head
    names(xlsx_cols_names) <- nams

    z  <- z[-1, , drop = FALSE]
    tt <- tt[-1, , drop = FALSE]
  }

  # # faster guess_col_type alternative? to avoid tt
  # types <- ftable(cc$row_r ~ cc$c_r ~ cc$typ)

  if (missing(types)) {
    types <- guess_col_type(tt)
  } else {
    # TODO check if guessing only if !all() is possible
    if (any(xlsx_cols_names %in% names(types))) {

      if (is.character(types)) {
        types[types == "character"] <- 0
        types[types == "numeric"]   <- 1
        types[types == "Date"]      <- 2
        types[types == "POSIXct"]   <- 3
        types[types == "logical"]   <- 4
        types[types == "hms"]       <- 5
        types[types == "formula"]   <- 6
      }

      if (!all(names(types) %in% xlsx_cols_names)) {
        warning("variable from `types` not found in data")
        types <- types[names(types) %in% xlsx_cols_names]
      }

      # assign types the correct column name "A", "B" etc.
      names(types) <- names(xlsx_cols_names[match(names(types), xlsx_cols_names)])

      # replace predefined types in guessed column types
      guess <- guess_col_type(tt)
      guess[names(types)] <- types
      types <- guess
    } else {
      stop("no variable from `types` found in data")
    }

    # avoid multiple conversion
    date_conv     <- function(x) convert_date(x, origin = origin)
    datetime_conv <- function(x) convert_datetime(x, origin = origin)
  }

  # could make it optional or explicit
  if (convert && ncol(z)) {
    z <- convert_df(z, types, date_conv, datetime_conv, hms_conv)

    ## this reduces the difference to releases < 1.15. If in mixed columns
    ## conversion to date fails and a character frame is returned, we return
    ## a character instead of the unix time stamp as character.
    if (detect_dates) {
      date_conv_c     <- function(...) as.character(date_conv(...))
      datetime_conv_c <- function(...) as.character(datetime_conv(...))
      hms_conv_c      <- function(...) as.character(hms_conv(...))

      sel <- !is.na(names(types))
      # update only if types is character
      chrs <- names(which(types[sel] == 0))

      for (chr in chrs) {
        not_na_chr <- !is.na(z[[chr]])
        sel <- tt[[chr]] == 2L & not_na_chr
        if (length(sel)) {
          z[[chr]][sel] <- vapply(z[[chr]][sel], date_conv_c, NA_character_)
        }

        sel <- tt[[chr]] == 3L & not_na_chr
        if (length(sel)) {
          z[[chr]][sel] <- vapply(z[[chr]][sel], datetime_conv_c, NA_character_)
        }

        sel <- tt[[chr]] == 5L & not_na_chr
        if (length(sel)) {
          z[[chr]][sel] <- vapply(z[[chr]][sel], hms_conv_c, NA_character_)
        }
      }
    }
  }

  # column names were picked earlier
  if (row_names) {
    rownames(z)  <- z[, 1]
    rownames(tt) <- z[, 1]
    xlsx_cols_names <- xlsx_cols_names[-1]

    z  <- z[, -1, drop = FALSE]
    tt <- tt[, -1, drop = FALSE]
  }

  if (col_names) {
    if (check_names) {
      xlsx_cols_names <- make.names(xlsx_cols_names, unique = TRUE)
    }

    names(z)  <- xlsx_cols_names
    names(tt) <- xlsx_cols_names
  }

  if (keep_attributes) {
    attr(z, "tt") <- tt
    attr(z, "types") <- types
    # attr(z, "sd") <- sd
    if (!missing(named_region)) attr(z, "dn") <- nr
  }
  z
}

# `read_xlsx()` -----------------------------------------------------------------
# Ignored by roxygen2 when combining documentation
# #' Read from an input file or Workbook object
#' @rdname wb_to_df
#' @export
read_xlsx <- function(
  file,
  sheet,
  start_row         = NULL,
  start_col         = NULL,
  row_names         = FALSE,
  col_names         = TRUE,
  skip_empty_rows   = FALSE,
  skip_empty_cols   = FALSE,
  rows              = NULL,
  cols              = NULL,
  detect_dates      = TRUE,
  named_region,
  na                = "#N/A",
  fill_merged_cells = FALSE,
  check_names       = FALSE,
  show_hyperlinks   = FALSE,
  ...
) {

  # keep sheet missing // read_xlsx is the function to replace.
  # dont mess with wb_to_df
  if (missing(file))
    file <- substitute()

  if (missing(sheet))
    sheet <- substitute()

  wb_to_df(
    file              = file,
    sheet             = sheet,
    start_row         = start_row,
    start_col         = start_col,
    row_names         = row_names,
    col_names         = col_names,
    skip_empty_rows   = skip_empty_rows,
    skip_empty_cols   = skip_empty_cols,
    rows              = rows,
    cols              = cols,
    detect_dates      = detect_dates,
    named_region      = named_region,
    na                = na,
    fill_merged_cells = fill_merged_cells,
    check_names       = check_names,
    show_hyperlinks   = show_hyperlinks,
    ...               = ...
  )
}

# `wb_read()` ------------------------------------------------------------------
#' @rdname wb_to_df
#' @export
wb_read <- function(
  file,
  sheet           = 1,
  start_row       = NULL,
  start_col       = NULL,
  row_names       = FALSE,
  col_names       = TRUE,
  skip_empty_rows = FALSE,
  skip_empty_cols = FALSE,
  rows            = NULL,
  cols            = NULL,
  detect_dates    = TRUE,
  named_region,
  na              = "NA",
  check_names     = FALSE,
  show_hyperlinks = FALSE,
  ...
) {

  # keep sheet missing // read_xlsx is the function to replace.
  # dont mess with wb_to_df
  if (missing(file))
    file <- substitute()

  if (missing(sheet))
    sheet <- substitute()

  wb_to_df(
    file            = file,
    sheet           = sheet,
    start_row       = start_row,
    start_col       = start_col,
    row_names       = row_names,
    col_names       = col_names,
    skip_empty_rows = skip_empty_rows,
    skip_empty_cols = skip_empty_cols,
    rows            = rows,
    cols            = cols,
    detect_dates    = detect_dates,
    named_region    = named_region,
    na              = na,
    check_names     = check_names,
    show_hyperlinks = show_hyperlinks,
    ...             = ...
  )

}

#' Add the `wb_data` attribute to a data frame in a worksheet
#'
#' provide wb_data object as mschart input
#'
#' @param wb a workbook
#' @param sheet a sheet in the workbook either name or index
#' @param dims the dimensions
#' @param ... additional arguments for `wb_to_df()`. Be aware that not every
#' argument is valid.
#' @returns A data frame of class `wb_data`.
#' @seealso [wb_to_df()] [wb_add_mschart()], [wb_add_pivot_table()]
#' @examples
#'  wb <- wb_workbook()
#'  wb <- wb_add_worksheet(wb)
#'  wb <- wb_add_data(wb, x = mtcars, dims = "B2")
#'
#'  wb_data(wb, 1, dims = "B2:E6")
#' @export
wb_data <- function(wb, sheet = current_sheet(), dims, ...) {
  assert_workbook(wb)
  sheetno <- wb$clone()$.__enclos_env__$private$get_sheet_index(sheet)
  sheetname <- wb$get_sheet_names(escape = TRUE)[[sheetno]]

  if (missing(dims)) {
    dims <- unlist(xml_attr(wb$worksheets[[sheetno]]$dimension, "dimension"), use.names = FALSE)
  }

  z <- wb_to_df(wb, sheet, dims = dims, ...)
  attr(z, "dims")  <- dims_to_dataframe(dims, fill = TRUE, empty_rm = TRUE)
  attr(z, "sheet") <- sheetname

  class(z) <- c("wb_data", "data.frame")
  z
}

#' Extract or Replace Parts of an `wb_data` Object
#' @method [ wb_data
#' @param x x
#' @param i i
#' @param j j
#' @param drop drop
#' @rdname wb_data
#' @export
"[.wb_data" <- function(x, i, j, drop = !((missing(j) && length(i) > 1) || (!missing(i) && length(j) > 1))) {

  sheet <- attr(x, "sheet")
  dd    <- attr(x, "dims")

  class(x) <- "data.frame"

  has_colnames <- as.integer(nrow(dd) - nrow(x))

  if (missing(j) && is.character(i)) {
    j <- match(i, colnames(x))
    i <- seq_len(nrow(x))
  }

  if (missing(i)) {
    i <- seq_len(nrow(x))
  }

  if (missing(j)) {
    j <- seq_along(x)
  }

  x <- x[i, j, drop]

  if (inherits(x, "data.frame")) {

    # we have the colnames in the first row
    if (all(i < 0)) {
      sel <- seq_len(nrow(dd))
      i <- sel[!sel %in% (abs(i) + has_colnames)]
    } else {
      i <- c(1, i + has_colnames)
    }

    dd <- dd[i, j, drop]
    attr(x, "dims")  <- dd
    attr(x, "sheet") <- sheet

    class(x) <- c("wb_data", "data.frame")
  }

  x
}

Try the openxlsx2 package in your browser

Any scripts or data that you put into this service are public.

openxlsx2 documentation built on March 7, 2026, 5:06 p.m.