Nothing
# Internal function to convert data frame from character to whatever is required
convert_df <- function(z, types, date_conv, datetime_conv, hms_conv, as_character = FALSE, col_names = FALSE) {
type_vals <- types[!is.na(names(types))]
if (length(type_vals) == 0) {
warning("could not convert. All missing in row used for variable names")
return(z)
}
if (col_names) {
# avoid scientific notation in column names
op <- default_save_opt()
on.exit(options(op), add = TRUE)
}
nums <- which(type_vals == 1)
dtes <- which(type_vals == 2)
poxs <- which(type_vals == 3)
logs <- which(type_vals == 4)
difs <- which(type_vals == 5)
fmls <- which(type_vals == 6)
if (as_character) {
if (length(nums)) z[nums] <- lapply(z[nums], function(i) as.character(convert_num(i)))
if (length(dtes)) z[dtes] <- lapply(z[dtes], function(i) as.character(date_conv(i)))
if (length(poxs)) z[poxs] <- lapply(z[poxs], function(i) as.character(datetime_conv(i)))
if (length(logs)) z[logs] <- lapply(z[logs], function(i) as.character(as.logical(i)))
if (length(difs)) z[difs] <- lapply(z[difs], function(i) as.character(hms_conv(i)))
} else {
if (length(nums)) z[nums] <- lapply(z[nums], convert_num)
if (length(dtes)) z[dtes] <- lapply(z[dtes], date_conv)
if (length(poxs)) z[poxs] <- lapply(z[poxs], datetime_conv)
if (length(logs)) z[logs] <- lapply(z[logs], as.logical)
if (length(difs)) z[difs] <- lapply(z[difs], hms_conv)
}
if (length(fmls)) {
for (i in fmls) {
class(z[[i]]) <- c(class(z[[i]]), "formula")
}
}
z
}
# `wb_to_df()` ----------------------------------------
#' Create a data frame from a Workbook
#'
#' @description
#' The `wb_to_df()` function is the primary interface for extracting data from
#' spreadsheet files into R. It interprets the underlying XML structure of a
#' worksheet to reconstruct a data frame, handling cell types, dimensions, and
#' formatting according to user specification. While `read_xlsx()` and
#' `wb_read()` are available as streamlined internal wrappers for users
#' accustomed to other spreadsheet packages, wb_to_df() serves as the
#' foundational function and provides the most comprehensive access to the
#' package's data extraction and configuration parameters.
#'
#' @details
#' The function extracts data based on a defined range or the total data extent
#' of a worksheet. If `col_names = TRUE`, the first row of the selection is
#' treated as the header; otherwise, spreadsheet column letters are used. If
#' `row_names = TRUE`, the first column of the selected range is assigned to
#' the data frame's row names.
#'
#' Dimension selection is highly flexible. The `dims` argument supports standard
#' "A1:B2" notation as well as dynamic wildcards for rows and columns. Using
#' `++` or `--` allows ranges to adapt to the spreadsheet's content. For
#' instance, `dims = "A2:C+"` reads from A2 to the last available row in
#' column C, while `dims = "A-:+9"` reads from the first populated row in
#' column A to the last column in row 9. If neither `dims` nor `named_region`
#' is provided, the function automatically calculates the range based on the
#' minimum and maximum populated cells, modified by `start_row` and `start_col`.
#'
#' Type conversion is governed by an internal guessing engine. If `detect_dates`
#' is enabled, serial dates are converted to R Date or POSIXct objects. All
#' datetimes are standardized to UTC. The function's handling of time variables
#' depends on the presence of the `hms` package; if loaded, `wb_to_df()` returns
#' `hms` variables. Otherwise, they are returned as string variables in
#' `hh:mm:ss` format. Users can provide explicit column types via the `types`
#' argument using numeric codes: 0 (character), 1 (numeric), 2 (Date), 3 (POSIXct),
#' 4 (logical), 5 (hms), and 6 (formula).
#'
#' Regarding formulas, it is important to note that `wb_to_df()` will not
#' automatically evaluate formulas added to a workbook object via
#' [wb_add_formula()]. In the underlying spreadsheet XML, only the formula
#' expression is written; the resulting value is typically generated by the
#' spreadsheet software's calculation engine when the file is opened and saved.
#' Consequently, reading a newly added formula cell without prior evaluation in
#' external software will result in an empty value unless `show_formula = TRUE`
#' is used to retrieve the formula string itself.
#'
#' If `keep_attributes` is TRUE, the data frame is returned with additional
#' metadata. This includes the internal type-guessing table (`tt`), which
#' identifies the derived type for every cell in the range, and the specific
#' `types` vector used for conversion. These attributes are useful for
#' debugging or for applications requiring precise knowledge of the
#' spreadsheet's original cell metadata.
#'
#' Specialized spreadsheet features include the ability to extract hyperlink
#' targets (`show_hyperlinks = TRUE`) instead of display text. For complex
#' layouts, `fill_merged_cells` propagates the value of a top-left merged cell
#' to all cells within the merge range. The `na` argument supports sophisticated
#' missing value definitions, accepting either a character vector or a named
#' list to differentiate between string and numeric `NA` types.
#'
#' @section Notes:
#' Recent versions of `openxlsx2` have introduced several changes to the
#' `wb_to_df()` API:
#' * Legacy arguments such as `na.strings` and `na.numbers` are no longer part
#' of the public API and have been consolidated into the `na` argument.
#' * As of version 1.15, all datetime variables are imported with the
#' timezone set to "UTC" to prevent system-specific local timezone shifts.
#' * The function now supports reverse-order or specific-order imports when
#' a numeric vector is passed to the `rows` argument.
#'
#' For extensive real-world examples and advanced usage patterns, consult
#' the package vignettes—specifically "openxlsx2 read to data frame"—and
#' the dedicated chapter in the `openxlsx2` book for real-life case studies.
#'
#' @param file A workbook file path, a [wbWorkbook] object, or a valid URL.
#' @param sheet The name or index of the worksheet to read. Defaults to the first sheet.
#' @param start_row,start_col Optional numeric values specifying the first row or column
#' to begin data discovery.
#' @param row_names Logical; if TRUE, uses the first column of the selection as row names.
#' @param col_names Logical; if TRUE, uses the first row of the selection as column headers.
#' @param skip_empty_rows,skip_empty_cols Logical; if TRUE, filters out rows or
#' columns containing only missing values.
#' @param skip_hidden_rows,skip_hidden_cols Logical; if TRUE, excludes rows or
#' columns marked as hidden in the worksheet metadata.
#' @param rows,cols Optional numeric vectors specifying the exact indices to read.
#' @param detect_dates Logical; if TRUE, identifies date and datetime styles for conversion.
#' @param na A character vector or a named list (e.g., `list(strings = "", numbers = -99)`)
#' defining values to treat as `NA`.
#' @param fill_merged_cells Logical; if TRUE, propagates the top-left value of a
#' merged range to all cells in that range.
#' @param dims A character string defining the range. Supports wildcards
#' (e.g., "A1:++" or "A-:+5").
#' @param named_region A character string referring to a defined name or spreadsheet Table.
#' @param show_formula Logical; if TRUE, returns the formula strings instead of
#' calculated values.
#' @param convert Logical; if TRUE, attempts to coerce columns to appropriate R classes.
#' @param types A named vector (numeric or character) to explicitly define column types.
#' @param keep_attributes Logical; if TRUE, attaches metadata such as the internal
#' type table (tt) and types as attributes to the output.
#' @param check_names Logical; if TRUE, ensures column names are syntactically
#' valid R names via [make.names()].
#' @param show_hyperlinks Logical; if TRUE, replaces cell values with their
#' underlying hyperlink targets.
#' @param apply_numfmts Logical; if TRUE, applies spreadsheet number formatting
#' and returns strings.
#' @param ... Additional arguments passed to internal methods.
#'
#' @examples
#' ###########################################################################
#' # numerics, dates, missings, bool and string
#' example_file <- system.file("extdata", "openxlsx2_example.xlsx", package = "openxlsx2")
#' wb1 <- wb_load(example_file)
#'
#' # import workbook
#' wb_to_df(wb1)
#'
#' # do not convert first row to column names
#' wb_to_df(wb1, col_names = FALSE)
#'
#' # do not try to identify dates in the data
#' wb_to_df(wb1, detect_dates = FALSE)
#'
#' # return the underlying spreadsheet formula instead of their values
#' wb_to_df(wb1, show_formula = TRUE)
#'
#' # read dimension without colNames
#' wb_to_df(wb1, dims = "A2:C5", col_names = FALSE)
#'
#' # read selected cols
#' wb_to_df(wb1, cols = c("A:B", "G"))
#'
#' # read selected rows
#' wb_to_df(wb1, rows = c(2, 4, 6))
#'
#' # convert characters to numerics and date (logical too?)
#' wb_to_df(wb1, convert = FALSE)
#'
#' # erase empty rows from dataset
#' wb_to_df(wb1, skip_empty_rows = TRUE)
#'
#' # erase empty columns from dataset
#' wb_to_df(wb1, skip_empty_cols = TRUE)
#'
#' # convert first row to rownames
#' wb_to_df(wb1, sheet = 2, dims = "C6:G9", row_names = TRUE)
#'
#' # define type of the data.frame
#' wb_to_df(wb1, cols = c(2, 5), types = c("Var1" = 0, "Var3" = 1))
#'
#' # start in row 5
#' wb_to_df(wb1, start_row = 5, col_names = FALSE)
#'
#' # na string
#' wb_to_df(wb1, na = "a")
#'
#' # read names from row two and data starting from row 4
#' wb_to_df(wb1, dims = "B2:C2,B4:C+")
#'
#' ###########################################################################
#' # Named regions
#' file_named_region <- system.file("extdata", "namedRegions3.xlsx", package = "openxlsx2")
#' wb2 <- wb_load(file_named_region)
#'
#' # read dataset with named_region (returns global first)
#' wb_to_df(wb2, named_region = "MyRange", col_names = FALSE)
#'
#' # read named_region from sheet
#' wb_to_df(wb2, named_region = "MyRange", sheet = 4, col_names = FALSE)
#'
#' # read_xlsx() and wb_read()
#' example_file <- system.file("extdata", "openxlsx2_example.xlsx", package = "openxlsx2")
#' read_xlsx(file = example_file)
#' df1 <- wb_read(file = example_file, sheet = 1)
#' df2 <- wb_read(file = example_file, sheet = 1, rows = c(1, 3, 5), cols = 1:3)
#' @export
wb_to_df <- function(
file,
sheet,
start_row = NULL,
start_col = NULL,
row_names = FALSE,
col_names = TRUE,
skip_empty_rows = FALSE,
skip_empty_cols = FALSE,
skip_hidden_rows = FALSE,
skip_hidden_cols = FALSE,
rows = NULL,
cols = NULL,
detect_dates = TRUE,
na = "#N/A",
fill_merged_cells = FALSE,
dims,
show_formula = FALSE,
convert = TRUE,
types,
named_region,
keep_attributes = FALSE,
check_names = FALSE,
show_hyperlinks = FALSE,
apply_numfmts = FALSE,
...
) {
arguments <- c(ls(), "na.strings", "na.numbers", "xlsx_file")
standardize_case_names(..., arguments = arguments)
na_strings <- NULL
na_numbers <- NA
if (is.character(na)) {
na_strings <- na
}
if (is.list(na)) {
na_strings <- na$strings
na_numbers <- na$numbers %||% NA
}
if (apply_numfmts) convert <- FALSE
args <- list(...)
if (any(c("na.strings", "na.numbers") %in% names(args))) {
# # Its a little premature to activate this
# if (getOption("openxlsx2.soon_deprecated", default = FALSE)) {
# msg <- paste0(
# "na.strings and na.numbers should be combined in a named list,",
# "`wb_to_df(na = list(strings = '#N/A', numbers = 999)`"
# )
# warning(msg, call. = FALSE)
# }
if ("na.strings" %in% names(args)) na_strings <- args[["na.strings"]]
if ("na.numbers" %in% names(args)) na_numbers <- args[["na.numbers"]]
}
xlsx_file <- args$xlsx_file
if (!is.null(xlsx_file)) {
.Deprecated(old = "xlsx_file", new = "file", package = "openxlsx2")
file <- xlsx_file %||% file
}
if (!is.null(cols)) cols <- col2int(cols)
if (inherits(file, "wbWorkbook")) {
wb <- file
} else {
# passes missing further on
if (missing(sheet))
sheet <- substitute()
data_only <- TRUE
# TODO tables and hyperlinks are deeper embedded into the wb_load code
if (!missing(named_region) || show_hyperlinks) data_only <- FALSE
# possible false positive on current lintr runs
wb <- wb_load(file, sheet = sheet, data_only = data_only) # nolint
}
if (!missing(named_region)) {
nr <- wb$get_named_regions(tables = TRUE)
if ((named_region %in% nr$name) && missing(sheet)) {
sel <- nr[nr$name == named_region, ][1, ]
sheet <- sel$sheet
dims <- sel$coords
} else if (named_region %in% nr$name) {
sel <- nr[nr$name == named_region & nr$sheet == wb_validate_sheet(wb, sheet), ]
if (NROW(sel) == 0) {
stop("no such named_region on selected sheet")
}
dims <- sel$coords
} else {
stop("no such named_region")
}
}
if (missing(sheet)) {
# TODO default sheet as 1
sheet <- 1
}
if (is.factor(sheet)) {
sheet <- as.character(sheet)
}
if (is.character(sheet)) {
sheet <- wb_validate_sheet(wb, sheet)
}
if (is.na(sheet)) {
stop("sheet not found. available sheets are: \n", paste0(wb$get_sheet_names(), collapse = ", "))
}
ws <- wb$worksheets[[sheet]]
# the sheet has no data
if (is.null(ws$sheet_data$cc) ||
nrow(ws$sheet_data$cc) == 0) {
# TODO do we need more checks or do we need to initialize a new cc object?
message("sheet found, but contains no data")
return(NULL)
}
# # Should be available, but is optional according to openxml-2.8.1. Still some
# # third party applications are known to require it. Maybe make using
# # dimensions an optional parameter?
# if (missing(dims))
# dims <- getXML1attr_one(ws$dimension,
# "dimension",
# "ref")
# If no dims are requested via named_region, simply construct them from min
# and max columns and row found on worksheet
# TODO it would be useful to have both named_region and dims?
has_dims <- TRUE
if (missing(named_region) && missing(dims)) {
has_dims <- FALSE
sd <- ws$sheet_data$cc[c("row_r", "c_r")]
row <- range(as.integer(unique(sd$row_r)))
col <- range(col2int(unique(sd$c_r)))
if (!is.null(start_row) && as.integer(start_row) < row[1])
row[1] <- start_row
if (!is.null(start_col) && col2int(start_col) < col[1])
col[1] <- start_col
if (row[1] > row[2]) row[2] <- row[1]
if (col[1] > col[2]) col[2] <- col[1]
dims <- paste0(int2col(col[1]), row[1], ":",
int2col(col[2]), row[2])
}
row_attr <- ws$sheet_data$row_attr
cc <- ws$sheet_data$cc
sst <- wb$sharedStrings
rnams <- row_attr$r
rnams <- rnams[rnams %in% unique(cc$row_r)] # avoid blank row interference
numfmt_date <- numfmt_is_date(wb$styles_mgr$styles$numFmts)
xlsx_date_style <- style_is_date(wb$styles_mgr$styles$cellXfs, numfmt_date)
# exclude if year, month or day are suspected
numfmt_hms <- numfmt_is_hms(wb$styles_mgr$styles$numFmts)
xlsx_hms_style <- style_is_hms(wb$styles_mgr$styles$cellXfs, numfmt_hms)
numfmt_posix <- numfmt_is_posix(wb$styles_mgr$styles$numFmts)
xlsx_posix_style <- style_is_posix(wb$styles_mgr$styles$cellXfs, numfmt_posix)
# create temporary data frame. hard copy required
z <- dims_to_dataframe(dims, empty_rm = TRUE, cc = cc)
tt <- create_int_dataframe(z)
keep_cols <- colnames(z)
keep_rows <- rownames(z)
maxRow <- max(as.numeric(keep_rows))
maxCol <- max(col2int(keep_cols))
if (!is.null(start_row)) {
keep_rows <- as.character(seq(start_row, maxRow))
if (start_row <= maxRow) {
sel <- rownames(z) %in% keep_rows
z <- z[sel, , drop = FALSE]
tt <- tt[sel, , drop = FALSE]
} else {
keep_rows <- as.character(start_row)
z <- z[keep_rows, , drop = FALSE]
tt <- tt[keep_rows, , drop = FALSE]
rownames(z) <- as.integer(keep_rows)
rownames(tt) <- as.integer(keep_rows)
}
}
if (!is.null(rows)) {
keep_rows <- as.character(as.integer(rows))
if (!anyNA(sel <- match(keep_rows, rownames(z)))) {
z <- z[sel, , drop = FALSE]
tt <- tt[sel, , drop = FALSE]
} else {
z <- z[keep_rows, , drop = FALSE]
tt <- tt[keep_rows, , drop = FALSE]
ints <- as.integer(keep_rows)
rownames(z) <- ints
rownames(tt) <- ints
}
}
if (!is.null(start_col)) {
keep_cols <- int2col(seq(col2int(start_col), maxCol))
if (!all(sel <- keep_cols %in% colnames(z))) {
keep_col <- keep_cols[!sel]
z[keep_col] <- NA_character_
tt[keep_col] <- NA_integer_
z <- z[keep_cols]
tt <- tt[keep_cols]
}
sel <- match(keep_cols, colnames(z))
z <- z[, sel, drop = FALSE]
tt <- tt[, sel, drop = FALSE]
}
if (!is.null(cols)) {
keep_cols <- int2col(cols)
if (!all(keep_cols %in% colnames(z))) {
keep_col <- keep_cols[!keep_cols %in% colnames(z)]
z[keep_col] <- NA_character_
tt[keep_col] <- NA_integer_
}
sel <- match(keep_cols, colnames(z))
z <- z[, sel, drop = FALSE]
tt <- tt[, sel, drop = FALSE]
}
keep_rows <- intersect(keep_rows, rnams)
# reduce data to selected cases only
if (has_dims && length(keep_rows) && length(keep_cols))
cc <- cc[cc$row_r %in% keep_rows & cc$c_r %in% keep_cols, ]
cc$val <- rep_len(NA_character_, nrow(cc))
cc$typ <- rep_len(NA_integer_, nrow(cc))
cc_tab <- unique(cc$c_t)
# bool
if (any(cc_tab == "b")) {
sel <- cc$c_t %in% "b"
cc$val[sel] <- as.logical(as.numeric(cc$v[sel]))
cc$typ[sel] <- 4L
}
# text in v
if (any(cc_tab %in% c("str", "e"))) {
sel <- cc$c_t %in% c("str", "e")
cc$val[sel] <- replaceXMLEntities(cc$v[sel])
cc$typ[sel] <- 0L
}
# text in t
if (any(cc_tab == "inlineStr")) {
sel <- cc$c_t %in% c("inlineStr")
cc$val[sel] <- is_to_txt(cc$is[sel])
cc$typ[sel] <- 0L
}
# test is sst
if (any(cc_tab == "s")) {
sel <- cc$c_t %in% c("s")
cc$val[sel] <- si_to_txt(sst[as.numeric(cc$v[sel]) + 1])
cc$typ[sel] <- 0L
}
has_na_string <- FALSE
# convert missings
if (!all(is.na(na_strings))) {
sel <- cc$val %in% na_strings
if (any(sel)) {
cc$val[sel] <- NA_character_
cc$typ[sel] <- -1L
has_na_string <- TRUE
}
}
has_na_number <- FALSE
# convert missings.
# at this stage we only have characters.
na_numbers <- as.character(na_numbers)
if (!all(is.na(na_numbers))) {
sel <- cc$v %in% na_numbers
if (any(sel)) {
cc$val[sel] <- NA_character_
cc$typ[sel] <- -2L
has_na_number <- TRUE
}
}
origin <- get_date_origin(wb)
# dates
if (NROW(cc) && !is.null(cc$c_s)) {
# if a cell is t="s" the content is a sst and not da date
all_styles <- c(xlsx_date_style, xlsx_hms_style, xlsx_posix_style)
if (detect_dates && missing(types) && length(all_styles)) {
uccs <- unique(cc$c_s)
if (any(uccs %in% all_styles)) {
strings <- c("s", "str", "b", "inlineStr")
is_string <- !is.null(cc$c_t) & (cc$c_t %in% strings)
is_valid_val <- !is_string & cc$v != "" & (cc$c_t != "e" | is.na(cc$c_t))
if (any(uccs %in% xlsx_date_style)) {
sel <- is_valid_val & (cc$c_s %in% xlsx_date_style)
if (any(sel)) { # Only run if there are actual matches
if (convert)
cc$val[sel] <- date_to_unix(cc$v[sel], origin = origin)
else
cc$val[sel] <- as.character(convert_date(cc$v[sel], origin = origin))
cc$typ[sel] <- 2L
}
}
if (any(uccs %in% xlsx_hms_style)) {
sel <- is_valid_val & (cc$c_s %in% xlsx_hms_style)
if (any(sel)) {
if (convert) {
cc$val[sel] <- cc$v[sel]
} else {
cc$val[sel] <- as.character(convert_hms(cc$v[sel]))
}
cc$typ[sel] <- 5L
}
}
if (any(uccs %in% xlsx_posix_style)) {
sel <- is_valid_val & (cc$c_s %in% xlsx_posix_style)
if (any(sel)) {
if (convert)
cc$val[sel] <- date_to_unix(cc$v[sel], origin = origin, datetime = TRUE)
else
cc$val[sel] <- as.character(convert_datetime(cc$v[sel], origin = origin))
cc$typ[sel] <- 3L
}
}
}
}
}
# remaining values are numeric?
if (any(cc_tab %in% c("n", ""))) {
sel <- which(is.na(cc$typ))
cc$val[sel] <- cc$v[sel]
cc$typ[sel] <- 1L
}
if (show_formula) {
if (any(grepl("shared", cc$f_attr))) {
# depending on the sheet, this might require updates to many cells
# TODO reduce this to cells, that are part of `cc`. Currently we
# might waste time, updating cells that are not visible to the user
cc_shared <- ws$sheet_data$cc
cc_shared$shared_fml <- rbindlist(xml_attr(paste0("<f ", cc_shared$f_attr, "/>"), "f"))$t
cc_shared <- cc_shared[cc_shared$shared_fml == "shared", ]
cc <- shared_as_fml(cc, cc_shared)
}
sel <- cc$f != ""
cc$val[sel] <- replaceXMLEntities(cc$f[sel])
cc$typ[sel] <- 6L
}
if (show_hyperlinks) {
if (length(ws$hyperlinks)) {
hls <- wb_to_hyperlink(wb, sheet)
hyprlnks <- as.data.frame(
do.call("rbind",
lapply(hls, function(hl) {
c(hl$ref, ifelse(is.null(hl$target), hl$location, hl$target))
})
),
stringsAsFactors = FALSE
)
cc$val[match(hyprlnks$V1, cc$r)] <- hyprlnks$V2
}
}
# convert "na_string" to missing
if (has_na_string) cc$typ[cc$typ == -1] <- NA_integer_
if (has_na_number) cc$typ[cc$typ == -2] <- NA_integer_
if (apply_numfmts) {
cc <- get_numfmt_style(wb, cc)
kc <- if (!is.null(cols)) cc$c_r %in% col2int(cols) else TRUE
not_blank_or_bool_error <- cc$num_fmt != "" & !cc$c_t %in% c("b", "e") & kc
# apply_numfmt expects numeric, character or date/posixct
sel <- not_blank_or_bool_error & cc$typ %in% c(1L, 4L)
if (any(sel)) {
cc$val[sel] <- apply_numfmt(as.numeric(cc$val[sel]), cc$num_fmt[sel])
cc$typ[sel] <- 0L
}
sel <- not_blank_or_bool_error & cc$typ %in% c(0L, 2L, 3L, 5L)
if (any(sel)) {
cc$val[sel] <- apply_numfmt(cc$val[sel], cc$num_fmt[sel])
cc$typ[sel] <- 0L
}
}
# prepare to create output object z
# we need to create the correct col and row position as integer starting at 0. Because we allow
# to select specific rows and columns, we must make sure that our zz cols and rows matches the
# z data frame.
zz <- data.frame(
val = cc$val,
typ = cc$typ,
cols = match(cc$c_r, colnames(z)) - 1L,
rows = match(cc$row_r, rownames(z)) - 1L,
stringsAsFactors = FALSE
)
# zz <- zz[order(zz[, "cols"], zz[, "rows"]), ]
if (any(zz$val == "", na.rm = TRUE)) zz <- zz[zz$val != "", ]
long_to_wide(z, tt, zz)
# backward compatible option. get the mergedCells dimension and fill it with
# the value of the first cell in the range. do the same for tt.
if (fill_merged_cells) {
mc <- ws$mergeCells
if (length(mc)) {
mc <- unlist(xml_attr(mc, "mergeCell"))
for (i in seq_along(mc)) {
filler <- stringi::stri_split_fixed(mc[i], pattern = ":")[[1]][1]
dms <- dims_to_dataframe(mc[i])
if (any(row_sel <- rownames(z) %in% rownames(dms)) &&
any(col_sel <- colnames(z) %in% colnames(dms))) {
# TODO there probably is a better way in not reducing cc above, so
# that we do not have to go through large xlsx files multiple times
z_fill <- wb_to_df(
file = wb,
sheet = sheet,
dims = filler,
na = list(
strings = na_strings,
numbers = na_numbers
),
convert = FALSE,
col_names = FALSE,
detect_dates = detect_dates,
show_formula = show_formula,
keep_attributes = TRUE
)
tt_fill <- attr(z_fill, "tt")
z[row_sel, col_sel] <- z_fill
tt[row_sel, col_sel] <- tt_fill
}
}
}
}
# the following two skip hidden columns and row and need a valid keep_rows and
# keep_cols length.
if (skip_hidden_rows) {
sel <- row_attr$hidden == "1" | row_attr$hidden == "true"
if (any(sel)) {
hide <- !keep_rows %in% row_attr$r[sel]
z <- z[hide, , drop = FALSE]
tt <- tt[hide, , drop = FALSE]
}
}
if (skip_hidden_cols) {
col_attr <- ws$unfold_cols()
sel <- col_attr$hidden == "1" | col_attr$hidden == "true"
if (any(sel)) {
hide <- col2int(keep_cols) %in% as.integer(col_attr$min[sel])
z[hide] <- NULL
tt[hide] <- NULL
}
}
# is.na needs convert
if (skip_empty_rows) {
empty <- vapply(seq_len(nrow(z)), function(x) all(is.na(z[x, ])), NA)
z <- z[!empty, , drop = FALSE]
tt <- tt[!empty, , drop = FALSE]
}
if (skip_empty_cols) {
empty <- vapply(z, function(x) all(is.na(x)), NA)
if (any(empty)) {
sel <- which(empty)
z[sel] <- NULL
tt[sel] <- NULL
}
}
# prepare colnames object
xlsx_cols_names <- colnames(z)
names(xlsx_cols_names) <- xlsx_cols_names
date_conv <- function(x) as.Date(.POSIXct(as.double(x), "UTC"), tz = "UTC", origin = "1970-01-01")
datetime_conv <- function(x) .POSIXct(as.double(x), "UTC")
hms_conv <- convert_hms
# if colNames, then change tt too. rownames will be converted later. If column name row
# is in z/tt, the column name guessing will fail below
if (col_names) {
# select first row as colnames, but do not yet assign. it might contain
# missing values and if assigned, convert below might break with unambiguous
# names.
z_head <- df_1(z)
tt_head <- df_1(tt)
nams <- names(xlsx_cols_names)
if (convert && ncol(z))
xlsx_cols_names <- convert_df(z_head, guess_col_type(tt_head), date_conv, datetime_conv, hms_conv, as_character = TRUE, col_names = TRUE)
else
xlsx_cols_names <- z_head
names(xlsx_cols_names) <- nams
z <- z[-1, , drop = FALSE]
tt <- tt[-1, , drop = FALSE]
}
# # faster guess_col_type alternative? to avoid tt
# types <- ftable(cc$row_r ~ cc$c_r ~ cc$typ)
if (missing(types)) {
types <- guess_col_type(tt)
} else {
# TODO check if guessing only if !all() is possible
if (any(xlsx_cols_names %in% names(types))) {
if (is.character(types)) {
types[types == "character"] <- 0
types[types == "numeric"] <- 1
types[types == "Date"] <- 2
types[types == "POSIXct"] <- 3
types[types == "logical"] <- 4
types[types == "hms"] <- 5
types[types == "formula"] <- 6
}
if (!all(names(types) %in% xlsx_cols_names)) {
warning("variable from `types` not found in data")
types <- types[names(types) %in% xlsx_cols_names]
}
# assign types the correct column name "A", "B" etc.
names(types) <- names(xlsx_cols_names[match(names(types), xlsx_cols_names)])
# replace predefined types in guessed column types
guess <- guess_col_type(tt)
guess[names(types)] <- types
types <- guess
} else {
stop("no variable from `types` found in data")
}
# avoid multiple conversion
date_conv <- function(x) convert_date(x, origin = origin)
datetime_conv <- function(x) convert_datetime(x, origin = origin)
}
# could make it optional or explicit
if (convert && ncol(z)) {
z <- convert_df(z, types, date_conv, datetime_conv, hms_conv)
## this reduces the difference to releases < 1.15. If in mixed columns
## conversion to date fails and a character frame is returned, we return
## a character instead of the unix time stamp as character.
if (detect_dates) {
date_conv_c <- function(...) as.character(date_conv(...))
datetime_conv_c <- function(...) as.character(datetime_conv(...))
hms_conv_c <- function(...) as.character(hms_conv(...))
sel <- !is.na(names(types))
# update only if types is character
chrs <- names(which(types[sel] == 0))
for (chr in chrs) {
not_na_chr <- !is.na(z[[chr]])
sel <- tt[[chr]] == 2L & not_na_chr
if (length(sel)) {
z[[chr]][sel] <- vapply(z[[chr]][sel], date_conv_c, NA_character_)
}
sel <- tt[[chr]] == 3L & not_na_chr
if (length(sel)) {
z[[chr]][sel] <- vapply(z[[chr]][sel], datetime_conv_c, NA_character_)
}
sel <- tt[[chr]] == 5L & not_na_chr
if (length(sel)) {
z[[chr]][sel] <- vapply(z[[chr]][sel], hms_conv_c, NA_character_)
}
}
}
}
# column names were picked earlier
if (row_names) {
rownames(z) <- z[, 1]
rownames(tt) <- z[, 1]
xlsx_cols_names <- xlsx_cols_names[-1]
z <- z[, -1, drop = FALSE]
tt <- tt[, -1, drop = FALSE]
}
if (col_names) {
if (check_names) {
xlsx_cols_names <- make.names(xlsx_cols_names, unique = TRUE)
}
names(z) <- xlsx_cols_names
names(tt) <- xlsx_cols_names
}
if (keep_attributes) {
attr(z, "tt") <- tt
attr(z, "types") <- types
# attr(z, "sd") <- sd
if (!missing(named_region)) attr(z, "dn") <- nr
}
z
}
# `read_xlsx()` -----------------------------------------------------------------
# Ignored by roxygen2 when combining documentation
# #' Read from an input file or Workbook object
#' @rdname wb_to_df
#' @export
read_xlsx <- function(
file,
sheet,
start_row = NULL,
start_col = NULL,
row_names = FALSE,
col_names = TRUE,
skip_empty_rows = FALSE,
skip_empty_cols = FALSE,
rows = NULL,
cols = NULL,
detect_dates = TRUE,
named_region,
na = "#N/A",
fill_merged_cells = FALSE,
check_names = FALSE,
show_hyperlinks = FALSE,
...
) {
# keep sheet missing // read_xlsx is the function to replace.
# dont mess with wb_to_df
if (missing(file))
file <- substitute()
if (missing(sheet))
sheet <- substitute()
wb_to_df(
file = file,
sheet = sheet,
start_row = start_row,
start_col = start_col,
row_names = row_names,
col_names = col_names,
skip_empty_rows = skip_empty_rows,
skip_empty_cols = skip_empty_cols,
rows = rows,
cols = cols,
detect_dates = detect_dates,
named_region = named_region,
na = na,
fill_merged_cells = fill_merged_cells,
check_names = check_names,
show_hyperlinks = show_hyperlinks,
... = ...
)
}
# `wb_read()` ------------------------------------------------------------------
#' @rdname wb_to_df
#' @export
wb_read <- function(
file,
sheet = 1,
start_row = NULL,
start_col = NULL,
row_names = FALSE,
col_names = TRUE,
skip_empty_rows = FALSE,
skip_empty_cols = FALSE,
rows = NULL,
cols = NULL,
detect_dates = TRUE,
named_region,
na = "NA",
check_names = FALSE,
show_hyperlinks = FALSE,
...
) {
# keep sheet missing // read_xlsx is the function to replace.
# dont mess with wb_to_df
if (missing(file))
file <- substitute()
if (missing(sheet))
sheet <- substitute()
wb_to_df(
file = file,
sheet = sheet,
start_row = start_row,
start_col = start_col,
row_names = row_names,
col_names = col_names,
skip_empty_rows = skip_empty_rows,
skip_empty_cols = skip_empty_cols,
rows = rows,
cols = cols,
detect_dates = detect_dates,
named_region = named_region,
na = na,
check_names = check_names,
show_hyperlinks = show_hyperlinks,
... = ...
)
}
#' Add the `wb_data` attribute to a data frame in a worksheet
#'
#' provide wb_data object as mschart input
#'
#' @param wb a workbook
#' @param sheet a sheet in the workbook either name or index
#' @param dims the dimensions
#' @param ... additional arguments for `wb_to_df()`. Be aware that not every
#' argument is valid.
#' @returns A data frame of class `wb_data`.
#' @seealso [wb_to_df()] [wb_add_mschart()], [wb_add_pivot_table()]
#' @examples
#' wb <- wb_workbook()
#' wb <- wb_add_worksheet(wb)
#' wb <- wb_add_data(wb, x = mtcars, dims = "B2")
#'
#' wb_data(wb, 1, dims = "B2:E6")
#' @export
wb_data <- function(wb, sheet = current_sheet(), dims, ...) {
assert_workbook(wb)
sheetno <- wb$clone()$.__enclos_env__$private$get_sheet_index(sheet)
sheetname <- wb$get_sheet_names(escape = TRUE)[[sheetno]]
if (missing(dims)) {
dims <- unlist(xml_attr(wb$worksheets[[sheetno]]$dimension, "dimension"), use.names = FALSE)
}
z <- wb_to_df(wb, sheet, dims = dims, ...)
attr(z, "dims") <- dims_to_dataframe(dims, fill = TRUE, empty_rm = TRUE)
attr(z, "sheet") <- sheetname
class(z) <- c("wb_data", "data.frame")
z
}
#' Extract or Replace Parts of an `wb_data` Object
#' @method [ wb_data
#' @param x x
#' @param i i
#' @param j j
#' @param drop drop
#' @rdname wb_data
#' @export
"[.wb_data" <- function(x, i, j, drop = !((missing(j) && length(i) > 1) || (!missing(i) && length(j) > 1))) {
sheet <- attr(x, "sheet")
dd <- attr(x, "dims")
class(x) <- "data.frame"
has_colnames <- as.integer(nrow(dd) - nrow(x))
if (missing(j) && is.character(i)) {
j <- match(i, colnames(x))
i <- seq_len(nrow(x))
}
if (missing(i)) {
i <- seq_len(nrow(x))
}
if (missing(j)) {
j <- seq_along(x)
}
x <- x[i, j, drop]
if (inherits(x, "data.frame")) {
# we have the colnames in the first row
if (all(i < 0)) {
sel <- seq_len(nrow(dd))
i <- sel[!sel %in% (abs(i) + has_colnames)]
} else {
i <- c(1, i + has_colnames)
}
dd <- dd[i, j, drop]
attr(x, "dims") <- dd
attr(x, "sheet") <- sheet
class(x) <- c("wb_data", "data.frame")
}
x
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.