#' Read in .pptx Content
#'
#' Read in the content from a .pptx file.
#'
#' @param file The path to the .pptx file.
#' @param skip The number of lines to skip.
#' @param remove.empty logical. If `TRUE` empty elements in the vector are
#' removed.
#' @param trim logical. If `TRUE` the leading/training white space is
#' removed.
#' @param include.notes logical. If `TRUE` then slide notes are included.
#' @param ... ignored.
#' @return Returns a [base::data.frame()] with the slide number
#' (`slide_id`), line number (`element_id`), and the `text`.
#' @keywords pptx
#' @export
#' @examples
#' \dontrun{
#' url <- file.path("https://www.oclc.org/content/dam/research/presentations",
#' "2019/111319-godby-NISO-What-Are-Entities-Matter.pptx")
#' file <- download(url)
#' (txt <- read_pptx(file))
#'
#' pptx_doc <- system.file('docs/Hello_World.pptx', package = "textreadr")
#' read_pptx(pptx_doc)
#' read_pptx(pptx_doc, include.notes = TRUE)
#' }
read_pptx <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, include.notes = FALSE, ...) {
filetype <- tools::file_ext(file)
if (filetype %in% c('pptx') && grepl('^([fh]ttp)', file)){
file <- download(file)
}
## create temp dir
tmp <- tempfile()
if (!dir.create(tmp)) stop("Temporary directory could not be established.")
## clean up
on.exit(unlink(tmp, recursive=TRUE))
## unzip docx
utils::unzip(file, exdir = tmp)
slides <- extract_pptx_nodes(
tmp,
trim = trim,
remove.empty = remove.empty,
skip = skip
)
if (include.notes){
notes <- extract_pptx_nodes(
tmp,
dir = 'notesSlides',
tag1 = '//p:txBody//a:p',
starts = lapply(slides, nrow),
trim = trim,
remove.empty = remove.empty,
skip = 0
)
slides <- lapply(names(slides), function(i){
rbind(
slides[[i]],
notes[[i]],
stringsAsFactors = FALSE
)
})
}
textshape::tidy_list(slides, 'slide_id')
}
extract_pptx_nodes <- function(loc, dir = 'slides', tag1 = '//a:p', tag2 = 'a:t', starts = NULL, remove.empty, trim, skip){
xmlfiles <- dir(file.path(loc, "ppt", dir), pattern = '[.]xml$', full.names = TRUE)
## read in the unzipped docx
docs <- lapply(xmlfiles, xml2::read_xml)
if (is.null(starts)) starts <- as.list(rep(0, length(docs)))
out <- Map(function(doc){
if (dir == 'slides'){
## extract the content
children <- lapply(xml2::xml_find_all(doc, tag1), xml2::xml_children)
pvalues <- unlist(lapply(children, function(x) {
paste(xml2::xml_text(xml2::xml_find_all(x, tag2)), collapse = ' ')
}))
} else {
pvalues <- lapply(xml2::xml_find_all(doc, tag1), xml2::xml_text)
}
## formatting
if (isTRUE(remove.empty)) pvalues <- pvalues[!grepl("^\\s*$", pvalues)]
if (skip > 0) pvalues <- pvalues[-seq(skip)]
if (isTRUE(trim)) pvalues <- trimws(pvalues)
data.frame(element_id = seq_along(pvalues), text = pvalues, stringsAsFactors = FALSE)
}, docs)
if (dir == 'slides'){
names(out) <- gsub('^(notes)?[Ss]lide|\\.xml$', '', basename(xmlfiles))
} else {
names(out) <- unlist(lapply(out, function(x) x$text[nrow(x)]))
lens <- unlist(starts[as.integer(names(out))])
out[] <- Map(function(x, len) {
x$element_id <- x$element_id + len
x[1:(nrow(x) - 1),]
}, out, lens)
}
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.