#' HTTP request
#'
#' \code{get_url(page_url)} makes an HTTP GET request to the target url
#' through \code{httr::RETRY}, so the request is retried if the page is
#' unavailable for some reason.
#'
#' @param page_url target url (string)
#' @return character with the response body if no error was raised while
#'   requesting or decoding the page, else NULL
get_url <- function(page_url) {
  tryCatch(
    {
      # pause_base / pause_cap are handed to httr::RETRY to control the
      # waiting time between attempts.
      response <- httr::RETRY(
        "GET",
        page_url,
        httr::user_agent("Mozilla/5.0"),
        pause_base = 2,
        pause_cap = 60
      )
      httr::content(response, as = "text")
    },
    error = function(e) {
      message("Error getting ", page_url)
      # Print only the error text. Passing the condition object itself to
      # message() would re-signal it, so an enclosing error handler could
      # intercept it before this function gets to return NULL.
      message(conditionMessage(e))
      NULL
    }
  )
}
#' Load blog configuration from file
#'
#' Parses a TOML file with \code{RcppTOML::parseTOML} and validates the result.
#' Missing top-level sections stop immediately. All other problems (missing or
#' invalid \code{index$type}, missing index values) are collected, each one is
#' emitted as a warning, and a single error is raised at the end. Missing
#' \code{content_xpaths} entries are only reported with a message.
#'
#' @param config_file character filename of config file
#' @return list blog configuration
load_config <- function(config_file) {
  config <- RcppTOML::parseTOML(config_file, escape = FALSE)
  errors <- character()

  # Check if all sections are present in config
  expected_sections <- c("index", "content_xpaths")
  missing_sections <- setdiff(expected_sections, names(config))
  if (length(missing_sections) > 0) {
    stop(paste(
      "Sections",
      paste(missing_sections, collapse = ", "),
      "missing in",
      config_file
    ))
  }

  # Check if valid index type is defined.
  # `[[` is used instead of `$` to avoid partial matching of key names.
  index_type <- config$index[["type"]]
  index_keys <- names(config$index)
  valid_index_types <- c("sitemaps", "pagination", "single-page")
  if (is.null(index_type)) {
    errors <- c(errors, paste("index$type missing in", config_file))
  } else if (length(index_type) != 1L ||
    !(index_type %in% valid_index_types)) {
    errors <- c(errors, paste("Invalid index$type", toString(index_type)))
  }

  # Check if required index config values are defined.
  # identical() is used for the type comparisons so that a missing type
  # (NULL) is simply "not this type" instead of crashing the `if`.
  expected_index_values <- c(
    "base_url", "articles_xpath", "articles_base_url"
  )
  if (identical(index_type, "sitemaps")) {
    expected_index_values <- c(expected_index_values, "sitemaps_xpath")
  }
  missing_index_values <- setdiff(expected_index_values, index_keys)
  if (length(missing_index_values) > 0) {
    errors <- c(
      errors,
      paste(paste0("index$", missing_index_values), "missing")
    )
  }

  # Pagination needs at least one way to discover the index pages
  if (identical(index_type, "pagination") &&
    !any(c("pagination_xpath", "number_of_pages") %in% index_keys)) {
    errors <- c(
      errors,
      "index$pagination_xpath OR index$number_of_pages missing"
    )
  }

  # Check if all content xpaths are defined (informational only)
  expected_content_xpaths <- c(
    "title",
    "author",
    "date",
    "text",
    "tags",
    "links",
    "images"
  )
  missing_content_xpaths <- setdiff(
    expected_content_xpaths,
    names(config$content_xpaths)
  )
  if (length(missing_content_xpaths) > 0) {
    message(paste(
      config_file, ":",
      "no values defined for",
      paste0("content_xpaths$", missing_content_xpaths, collapse = ", ")
    ))
  }

  # Report every collected problem, then abort
  if (length(errors) > 0) {
    for (e in errors) {
      warning(e)
    }
    stop(paste("Missing values in config file", config_file))
  }
  config
}
#' Combine several xpath expressions into one
#'
#' Joins the given expressions with the xpath union operator so that they can
#' be evaluated as a single expression.
#'
#' @param xp character vector of xpath expressions
#' @return character of xpath expressions merged by "|"
merge_xp <- function(xp) {
  union_operator <- "|"
  merged <- stringr::str_c(xp, collapse = union_operator)
  merged
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.