Nothing
#' @title Collect metadata and text into a data.frame
#'
#' @description Facilitates downstream processing with text mining packages
#' by providing metadata and full text in a tidy data.frame format
#'
#' @export
#' @param path a directory path, must exist. If `NULL` (default) the path is
#' taken from `cache_options_get()$path`
#' @param type (character) type of files to get. Default is `NULL` which gets
#' all supported types. Can be one of pdf, xml, or plain (file extensions:
#' pdf, xml, and txt, respectively)
#' @param encoding (character) encoding, if `NULL` we get it from
#' `getOption("encoding")`
#' @param xml_extract_text (logical) for XML, should we extract the text
#' (`TRUE`) or return a string as XML (`FALSE`). Default: `TRUE`
#' @return a tibble holding the result of the DOI lookup on the file names,
#' a `text` column with the content read from each file, and a `paths` column
#' with the full path of each file
#' @details You can alternatively use `readtext::readtext()` or similar
#' functions to achieve a similar outcome.
#'
#' Only files whose name ends in a supported extension (pdf, xml, txt) are
#' read; other entries in `path` are ignored.
#' @examples \dontrun{
#' if (interactive()) {
#' ## from a directory path
#' x <- ft_table()
#' x
#'
#' ## only xml
#' ft_table(type = "xml")
#'
#' ## only pdf
#' ft_table(type = "pdf")
#'
#' ## only plain text
#' ft_table(type = "plain")
#'
#' ## don't pull text out of xml, just give back the xml please
#' x <- ft_table(xml_extract_text = FALSE)
#' x
#' }}
ft_table <- function(path = NULL, type = NULL, encoding = NULL, xml_extract_text = TRUE) {
  if (is.null(path)) path <- cache_options_get()$path

  # check argument types first so a wrongly typed 'path' is reported as such
  # rather than as a low-level file.exists() failure
  assert(path, "character")
  assert(type, "character")
  assert(xml_extract_text, "logical")
  if (!file.exists(path)) stop(path, " does not exist")
  if (is.null(encoding)) encoding <- getOption("encoding")

  # map of user-facing type -> file extension; NULL type keeps all of them
  extensions <- c(pdf = "pdf", xml = "xml", plain = "txt")
  if (!is.null(type)) {
    if (length(type) != 1L || !type %in% names(extensions)) {
      stop("'type' must be one of 'xml', 'pdf', or 'plain'")
    }
    extensions <- extensions[[type]]
  }

  # escape the dot and anchor at the end so only real file extensions match
  pattern <- paste0("\\.(", paste(extensions, collapse = "|"), ")$")
  paths <- list.files(path, full.names = TRUE, pattern = pattern)

  out <- vapply(paths, reader, character(1),
    encoding = encoding, xml_extract_text = xml_extract_text)
  # file names (minus everything from the first dot on) are used for the lookup
  doidf <- dois_lookup(gsub("\\..+", "", basename(paths)))
  tibble::as_tibble(data.frame(doidf, text = out, paths, stringsAsFactors = FALSE))
}
# helpers ---------------
# Tell the caller whether a read attempt failed.
#
# `z` is the value produced by a read wrapped in tryCatch(); when it is an
# error condition a warning naming `path` is raised and TRUE is returned,
# otherwise FALSE is returned and nothing is signalled.
check_read <- function(z, path) {
  failed <- inherits(z, "error")
  if (failed) {
    warning(path, " malformed, could not read", call. = FALSE)
  }
  failed
}
# Read one file and return its content as a single string, choosing the
# reader from the file extension.
#
# x: path to the file
# encoding: passed on to xml_reader() when XML text is extracted
# xml_extract_text: if TRUE, XML files go through xml_reader(); if FALSE they
#   are read verbatim with txt_reader()
#
# Always returns a length-1 character vector. A file whose extension is not
# pdf, xml or txt (or that has no recognisable extension) triggers a warning
# and yields "" instead of NULL, so callers using vapply(..., character(1))
# keep working.
reader <- function(x, encoding, xml_extract_text) {
  # extension = first ".letters" run in the file name, without the dot;
  # lower-cased so that e.g. ".PDF" is dispatched like ".pdf"
  ext <- tolower(sub("\\.", "", strextract(basename(x), "\\.[A-Za-z]+")))
  supported <- length(ext) == 1L && !is.na(ext) &&
    ext %in% c("pdf", "xml", "txt")
  if (!supported) {
    warning(x, " has an unsupported file type, skipping", call. = FALSE)
    return(character(1))
  }
  switch(
    ext,
    pdf = pdf_reader(x),
    xml = if (xml_extract_text) xml_reader(x, encoding) else txt_reader(x),
    txt = txt_reader(x)
  )
}
# Read a PDF file and return all of its text as one string.
#
# Text is pulled out with pdftools::pdf_text(), the resulting pieces are
# joined with newlines and the string is marked as UTF-8. If the extraction
# fails, check_read() warns and an empty string is returned.
pdf_reader <- function(x) {
  pages <- tryCatch(
    pdftools::pdf_text(x),
    error = function(err) err
  )
  if (check_read(pages, x)) {
    return(character(1))
  }
  combined <- paste(pages, collapse = "\n")
  Encoding(combined) <- "UTF-8"
  combined
}
# Parse an XML file and return its text content.
#
# The file is parsed with xml2::read_xml() using `encoding`; the text is then
# taken with xml2::xml_text(). If parsing fails, check_read() warns and an
# empty string is returned.
xml_reader <- function(x, encoding) {
  doc <- tryCatch(
    xml2::read_xml(x, encoding = encoding),
    error = function(err) err
  )
  if (check_read(doc, x)) character(1) else xml2::xml_text(doc)
}
# Read a file verbatim and return its lines joined by newlines as one string.
#
# x: path to the file
#
# The connection is closed via on.exit() so it is released even when
# readLines() fails (e.g. the file cannot be opened), instead of leaking
# until garbage collection.
txt_reader <- function(x) {
  con <- file(x)
  on.exit(close(con), add = TRUE)
  paste0(readLines(con, warn = FALSE), collapse = "\n")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.