Nothing
#' Map XML Entries
#'
#' Extracts a dataframe from an xml document.
#'
#' @param doc An XML document
#' @param entries_xpath an xpath locator for all starting points
#' @param parts a list in the form column name = xpath locator
#' @param trim a list of columns that need to have whitespace trimmed
#' @param integers a list of columns that need to converted to integers
#' @param date_format how dates should be parsed
#'
#' @keywords internal
#'
#' @return A dataframe with one row per entry and columns from parts
map_xml <- function(doc,
entries_xpath,
parts,
trim = c(),
integers = c(),
date_format = "") {
xml2::xml_ns_strip(doc)
entries <- xml2::xml_find_all(doc, entries_xpath)
# TODO: a lot of optimization should be possible here...
res <- sapply(parts, function(path) {
sapply(entries, function(entry) {
node <- xml2::xml_find_first(entry, path)
if (typeof(node) == "character") {
return(node)
} else {
return(xml2::xml_text(node))
}
})
})
# TODO: this implies there is a bug in the preceeding maping that
# oversimplifies
if (length(entries) == 1) {
res <- t(res)
}
res <- data.frame(res, stringsAsFactors = FALSE)
for (col in trim) {
res[[col]] <- trimws(res[[col]])
}
for (col in integers) {
# To suppress warnings that don't matter, first convert spaces to NA
# from https://stackoverflow.com/questions/43734293/remove-non-breaking-space-character-in-string-in-r-on-linux
empties <- grepl("(*UCP)^\\s*$", res[[col]], perl = TRUE)
res[empties, col] <- NA
res[[col]] <- as.integer(res[[col]])
}
link_cols <- colnames(res)[grepl("href$", colnames(res))]
for (ref in link_cols) {
res[[ref]] <- ifelse(startsWith(res[[ref]], "javascript"),
regmatches(res[[ref]],
regexpr("http[^('|\")]+", res[[ref]])),
res[[ref]])
res[[ref]] <- ifelse(is.na(res[[ref]]), NA,
xml2::url_absolute(res[[ref]], xml2::xml_url(doc)))
# We need to do this because the rss gives http url's - this saves the
# redirect
res[[ref]] <- gsub("http:", "https:", res[[ref]])
}
date_cols <- colnames(res)[grepl("date$", colnames(res))]
for (ref in date_cols) {
res[[ref]] <- if (date_format == "") {
as.POSIXct(res[[ref]])
} else {
as.POSIXct(res[[ref]], format = date_format)
}
}
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.