R/import_list.R

Defines functions .read_file_as_list .read_multiple_files_as_list .strip_exts import_list

Documented in import_list

#' @title Import list of data frames
#' @description Use [import()] to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zipped directory in a zip file, or HTML file)
#' @param file A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip file, or HTML file), or a vector of file paths for multiple files to be imported.
#' @param which If `file` is a single file path, this specifies which objects should be extracted (passed to [import()]'s `which` argument). Ignored otherwise.
#' @param rbind A logical indicating whether to pass the import list of data frames through [data.table::rbindlist()].
#' @param rbind_label If `rbind = TRUE`, a character string specifying the name of a column to add to the data frame indicating its source file.
#' @param rbind_fill If `rbind = TRUE`, a logical indicating whether to set the `fill = TRUE` (and fill missing columns with `NA`).
#' @param \dots Additional arguments passed to [import()]. Behavior may be unexpected if files are of different formats.
#' @inheritParams import
#' @return If `rbind=FALSE` (the default), a list of a data frames. Otherwise, that list is passed to [data.table::rbindlist()] with `fill = TRUE` and returns a data frame object of class set by the `setclass` argument; if this operation fails, the list is returned.
#' @details When file is a vector of file paths and any files are missing, those files are ignored (with warnings) and this function will not raise any error.
#' @examples
#' ## For demo, a temp. file path is created with the file extension .xlsx
#' xlsx_file <- tempfile(fileext = ".xlsx")
#' export(
#'     list(
#'         mtcars1 = mtcars[1:10, ],
#'         mtcars2 = mtcars[11:20, ],
#'         mtcars3 = mtcars[21:32, ]
#'     ),
#'     xlsx_file
#' )
#'
#' # import a single file from multi-object workbook
#' import(xlsx_file, sheet = "mtcars1")
#' # import all worksheets, the return value is a list
#' import_list(xlsx_file)
#'
#' # import and rbind all worksheets, the return value is a data frame
#' import_list(xlsx_file, rbind = TRUE)
#' @seealso [import()], [export_list()], [export()]
#' @export
import_list <- function(file, setclass = getOption("rio.import.class", "data.frame"), which, rbind = FALSE,
                        rbind_label = "_file", rbind_fill = TRUE, ...) {
    .check_file(file, single_only = FALSE)
    ## special cases
    if (length(file) == 1) {
        x <- .read_file_as_list(file = file, which = which, setclass = setclass, rbind = rbind, rbind_label = rbind_label, ...)
    } else {
        ## note the plural
        x <- .read_multiple_files_as_list(files = file, setclass = setclass, rbind = rbind, rbind_label = rbind_label, ...)
    }
    ## optionally rbind
    if (isTRUE(rbind)) {
        if (length(x) == 1) {
            x <- x[[1L]]
        } else {
            x2 <- try(data.table::rbindlist(x, fill = rbind_fill), silent = TRUE)
            if (inherits(x2, "try-error")) {
                warning("Attempt to rbindlist() the data did not succeed. List returned instead.", call. = FALSE)
                return(x)
            } else {
                x <- x2
            }
        }
        x <- set_class(x, class = setclass)
    }
    return(x)
}

.strip_exts <- function(file) {
    vapply(file, function(x) tools::file_path_sans_ext(basename(x)), character(1))
}

.read_multiple_files_as_list <- function(files, setclass, rbind, rbind_label, ...) {
    names(files) <- .strip_exts(files)
    x <- lapply(files, function(thisfile) {
        out <- try(import(thisfile, setclass = setclass, ...), silent = TRUE)
        if (inherits(out, "try-error")) {
            warning(sprintf("Import failed for %s", thisfile), call. = FALSE)
            ##out <- NULL
            return(NULL)
        } else if (isTRUE(rbind)) {
            out[[rbind_label]] <- thisfile
        }
        structure(out, filename = thisfile)
    })
    names(x) <- names(files)
    return(x)
}

.read_file_as_list <- function(file, which, setclass, rbind, rbind_label, ...) {
    if (R.utils::isUrl(file)) {
        file <- remote_to_local(file)
    }
    if (get_info(file)$format == "rdata") {
        e <- new.env()
        load(file, envir = e)
        return(as.list(e))
    }
    if (!get_info(file)$format %in% c("html", "xlsx", "xls", "zip")) {
        which <- 1
        whichnames <- NULL
    }
    ## getting list of `whichnames`
    if (get_info(file)$format == "html") {
        .check_pkg_availability("xml2")
        tables <- xml2::xml_find_all(xml2::read_html(unclass(file)), ".//table")
        if (missing(which)) {
            which <- seq_along(tables)
        }
        whichnames <- vapply(xml2::xml_attrs(tables[which]),
            function(x) if ("class" %in% names(x)) x["class"] else "",
            FUN.VALUE = character(1)
        )
        names(which) <- whichnames
    }
    if (get_info(file)$format %in% c("xls", "xlsx")) {
        ## .check_pkg_availability("readxl")
        whichnames <- readxl::excel_sheets(path = file)
        if (missing(which)) {
            which <- seq_along(whichnames)
            names(which) <- whichnames
        } else if (is.character(which)) {
            whichnames <- which
        } else {
            whichnames <- whichnames[which]
        }
    }
    if (get_info(file)$format %in% c("zip")) {
        if (missing(which)) {
            whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
            which <- seq_along(whichnames)
            names(which) <- .strip_exts(whichnames)
        } else if (is.character(which)) {
            whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
            whichnames <- whichnames[whichnames %in% which]
        } else {
            whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
            names(which) <- .strip_exts(whichnames)
        }
    }
    ## reading all `whichnames`
    x <- lapply(which, function(thiswhich) {
        out <- try(import(file, setclass = setclass, which = thiswhich, ...), silent = TRUE)
        if (inherits(out, "try-error")) {
            warning(sprintf("Import failed for %s from %s", thiswhich, file))
            out <- NULL
        } else if (isTRUE(rbind) && length(which) > 1) {
            out[[rbind_label]] <- thiswhich
        }
        out
    })
    names(x) <- whichnames
    return(x)
}
leeper/rio documentation built on April 8, 2024, 9:49 p.m.