#' Check if download was redirected
#'
#' @param response httr response object
#' @return logical, TRUE if any hop in the response chain had a 3xx status
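#' @examples
#' \dontrun{
#' # Illustrative stand-in for an httr response (internal helper, not exported):
#' # all_headers holds one header set per redirect hop, each with an integer
#' # status, so a 301 hop followed by a final 200 counts as redirected.
#' fake_response <- list(
#'   all_headers = list(list(status = 301L), list(status = 200L))
#' )
#' was_redirected(fake_response) # TRUE
#' }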
was_redirected <- function(response) {
  status <- vapply(
    response$all_headers, `[[`, integer(1), "status"
  )
  any(status >= 300 & status < 400)
}
#' Download article
#'
#' @param url character article url
#' @return list with $status (character status code or error info) and $html (character response body, or NA when not kept)
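#' @examples
#' \dontrun{
#' # Hypothetical URL, shown only to illustrate the return shape;
#' # this performs a real HTTP request.
#' result <- download_article("https://example.com/blog/post-1")
#' result$status # e.g. "200", "300" for redirects, or "Unknown error"
#' is.na(result$html) # TRUE when the body was not kept
#' }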
download_article <- function(url) {
  tryCatch(
    {
      response <- httr::RETRY(
        "GET",
        url,
        # httr::config(followlocation = 0),
        httr::user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:88.0) Gecko/20100101 Firefox/88.0"),
        times = 3,
        pause_base = 1,
        pause_cap = 30,
        terminate_on = c(301, 400, 404, 410, 500)
      )
      # Collapse every redirected response to a single "300" status
      status <- if (was_redirected(response)) {
        "300"
      } else {
        as.character(httr::status_code(response))
      }
      response_content <- httr::content(response, as = "text")
      # Keep the body only for successful or redirected responses with
      # non-trivial content
      html <- if (status %in% c("200", "300") && stringr::str_length(response_content) > 4) {
        response_content
      } else {
        NA
      }
      list(
        status = status,
        html = html
      )
    },
    error = function(e) {
      message("Error downloading ", url)
      message(conditionMessage(e))
      list(
        status = "Unknown error",
        html = NA
      )
    }
  )
}
#' Download all articles
#'
#' @param blog_config list blog configuration
#' @param db_config list db connection configuration
#' @return numeric number of processed entries
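#' @examples
#' \dontrun{
#' # Sketch with assumed config shapes: `id` and `seconds_between_requests`
#' # are read from blog_config; db_config is passed straight to DBI::dbConnect,
#' # so its fields must match RPostgres connection arguments.
#' blog_config <- list(id = "example-blog", seconds_between_requests = 2)
#' db_config <- list(
#'   dbname = "articles", host = "localhost", port = 5432,
#'   user = "scraper", password = "secret"
#' )
#' download_articles(blog_config, db_config)
#' }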
download_articles <- function(blog_config, db_config) {
  sleep_time <- if (is.null(blog_config$seconds_between_requests)) {
    1
  } else {
    blog_config$seconds_between_requests
  }
  # Connect to db and query unparsed articles
  con_select <- do.call(
    DBI::dbConnect,
    c(list(RPostgres::Postgres()), db_config)
  )
  con_update <- do.call(
    DBI::dbConnect,
    c(list(RPostgres::Postgres()), db_config)
  )
  new_articles <- db_find_new_urls(con_select, blog_config$id)
  # Make sure db con and results are cleared when function exits
  on.exit({
    DBI::dbClearResult(new_articles)
    DBI::dbDisconnect(con_select)
    DBI::dbDisconnect(con_update)
  })
  # Set chunk size for fetching articles from server
  chunk_size <- 500
  processed_article_count <- 0
  while (!DBI::dbHasCompleted(new_articles)) {
    # Fetch next chunk of articles
    articles <- DBI::dbFetch(new_articles, n = chunk_size)
    for (i in seq_len(nrow(articles))) {
      url <- articles$url[i]
      response <- download_article(url)
      processed_article_count <- processed_article_count + 1
      # Insert article info to database
      tryCatch(
        error = function(cnd) {
          stop(
            "Error inserting parsed article with id ",
            articles$id[i],
            " to db\n",
            "(", conditionMessage(cnd), ")"
          )
        },
        {
          db_update_download_status(
            con_update,
            articles$id[i],
            response$status,
            response$html
          )
        }
      )
      # Wait between requests, with +/- 1 second of jitter; never sleep
      # for a negative amount of time
      Sys.sleep(max(0, sleep_time + runif(1, min = -1, max = 1)))
    }
    # If this is a test, break the while loop after one iteration
  }
  processed_article_count
}
#' Download blog articles
#'
#' Download all new articles for blog
#' @param blog_config_file character path to blog config file (toml)
#' @param db_config_file character path to db config file (toml)
#' @return numeric number of downloaded articles
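#' @examples
#' \dontrun{
#' # Hypothetical file paths; both files are expected to be TOML.
#' download_blog("config/blogs/example-blog.toml", "config/db.toml")
#' }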
#' @export
download_blog <- function(blog_config_file, db_config_file) {
  print(paste("Downloading articles for", blog_config_file))
  blog_config <- load_config(blog_config_file)
  db_config <- RcppTOML::parseTOML(db_config_file)
  downloaded_article_count <- download_articles(
    blog_config,
    db_config
  )
  print(
    paste0(
      "... ", blog_config$id,
      ": ", downloaded_article_count, " articles downloaded."
    )
  )
  invisible(downloaded_article_count)
}