#' Get urls for index pages
#'
#' \code{default_get_index_urls(index_config)} returns the urls of a blog's
#' index pages. Index pages are sitemaps or archive pages
#' that contain links to articles.
#'
#' @param index_config blog specific list of parameters
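#'
#' @examples
#' \dontrun{
#' # Minimal sketch of a pagination-type configuration; the url and xpath
#' # below are placeholders, not taken from a real blog.
#' index_config <- list(
#'   type = "pagination",
#'   base_url = "https://example.com/blog/page/",
#'   pagination_xpath = "//nav//a" # links whose text holds the page numbers
#' )
#' default_get_index_urls(index_config)
#'
#' # A sitemap index works analogously: sitemaps_xpath selects the loc
#' # entries of the sub-sitemaps (again placeholder values).
#' sitemap_config <- list(
#'   type = "sitemaps",
#'   base_url = "https://example.com/sitemap.xml",
#'   sitemaps_xpath = "//sitemap/loc"
#' )
#' default_get_index_urls(sitemap_config)
#' }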
default_get_index_urls <- function(index_config) {
  # Set defaults for optional parameters if not provided
  if (is.null(index_config$increase_by)) index_config$increase_by <- 1
  if (is.null(index_config$start_at)) index_config$start_at <- 1
  if (is.null(index_config$reverse_order)) index_config$reverse_order <- FALSE
  # If all article urls are listed on the main page,
  # index_config$base_url is the index page listing all articles.
  if (index_config$type == "single-page") {
    return(index_config$base_url)
  }
  # If the blog is paginated and the page number is part of the url,
  # parse the last page number from the first index page and
  # return the sequence of index urls from first page to last page.
  if (index_config$type == "pagination") {
    tryCatch(
      {
        if (is.null(index_config$number_of_pages)) {
          page <- get_url(stringr::str_c(
            index_config$base_url,
            index_config$start_at
          ))
          index_config$number_of_pages <- page %>%
            xml2::read_html() %>%
            rvest::html_elements(
              xpath = merge_xp(index_config$pagination_xpath)
            ) %>%
            as.character() %>%
            stringr::str_extract("\\d+") %>%
            as.integer() %>%
            max(na.rm = TRUE)
        }
        index_urls <- stringr::str_c(
          index_config$base_url,
          seq(
            from = index_config$start_at,
            to = index_config$number_of_pages,
            by = index_config$increase_by
          )
        )
        return(index_urls)
      },
      error = function(e) {
        message(
          "Error getting last page number from ",
          stringr::str_c(index_config$base_url, index_config$start_at)
        )
        message(e)
        return(NULL)
      }
    )
  }
  # If the base url points to a sitemap with sub-sitemaps, extract
  # the sitemap urls containing article urls based on sitemaps_xpath
  if (index_config$type == "sitemaps") {
    return(tryCatch(
      {
        page <- get_url(index_config$base_url)
        index_urls <- page %>%
          xml2::read_html() %>%
          rvest::html_elements(xpath = merge_xp(index_config$sitemaps_xpath)) %>%
          rvest::html_text() %>%
          as.character() %>%
          unique()
        if (length(index_urls) == 0) index_urls <- NULL
        # Reverse the order only when explicitly requested
        if (isTRUE(index_config$reverse_order)) {
          rev(index_urls)
        } else {
          index_urls
        }
      },
      error = function(e) {
        message(stringr::str_c(
          "Error getting index urls from sitemap ",
          index_config$base_url
        ))
        message(e)
        return(NULL)
      }
    ))
  }
  # Unknown index type: nothing to return
  NULL
}
#' Extract article links from index page
#'
#' \code{default_get_article_urls(index_url, index_config)} loads an index
#' page and extracts all links to articles found on that page based on the
#' blog specific configuration.
#'
#' @param index_url character url of index page
#' @param index_config configuration for scraping and parsing of index pages
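#'
#' @examples
#' \dontrun{
#' # Hypothetical configuration; the xpath and urls are illustrative only.
#' index_config <- list(
#'   articles_xpath = "//article//h2/a/@@href",
#'   articles_base_url = "https://example.com/"
#' )
#' default_get_article_urls("https://example.com/blog/page/1", index_config)
#' }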
default_get_article_urls <- function(index_url, index_config) {
  tryCatch(
    {
      page <- get_url(index_url)
      article_urls <- page %>%
        xml2::read_html() %>%
        rvest::html_elements(
          xpath = merge_xp(index_config$articles_xpath)
        ) %>%
        rvest::html_text2() %>%
        unique() %>%
        xml2::url_absolute(index_config$articles_base_url)
      # Optionally keep only urls matching a pattern
      if ("articles_subset" %in% names(index_config)) {
        article_urls <- stringr::str_subset(
          article_urls,
          index_config$articles_subset
        )
      }
      # Optionally rewrite urls via a pattern/replacement pair
      if ("articles_replace" %in% names(index_config)) {
        article_urls <- stringr::str_replace_all(
          article_urls,
          index_config$articles_replace[1],
          index_config$articles_replace[2]
        )
      }
      article_urls %>% stringi::stri_remove_empty_na()
    },
    error = function(e) {
      message("Error extracting article links from ", index_url)
      message(e)
      return(NULL)
    }
  )
}
#' Import html parsing functions from file
#'
#' \code{load_index_parsers(filename)} imports a set of parsing functions
#' from the specified R file. It checks whether all expected functions are
#' defined in that file and falls back to the package defaults for any
#' that are missing.
#'
#' @param src optional, either "default" or a filename with custom parsing functions.
#' @return list of parsing functions
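#' @examples
#' \dontrun{
#' # Use the package defaults:
#' parsers <- load_index_parsers()
#'
#' # Or load overrides from a file; the file (here the hypothetical
#' # "my_parsers.R") defines functions with the expected names:
#' #   get_index_urls <- function(index_config) { ... }
#' #   get_article_urls <- function(index_url, index_config) { ... }
#' parsers <- load_index_parsers("my_parsers.R")
#' }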
#' @export
load_index_parsers <- function(src = "default") {
  expected_functions <- c(
    "get_index_urls",
    "get_article_urls"
  )
  default_parsers <- list(
    get_index_urls = default_get_index_urls,
    get_article_urls = default_get_article_urls
  )
  if (src == "default") {
    return(default_parsers)
  }
  parser_env <- new.env()
  source(src, local = parser_env)
  parsers <- lapply(expected_functions, function(f) {
    if (exists(f, where = parser_env)) {
      return(get(f, envir = parser_env))
    } else {
      message(paste0(
        f, " not defined in ", src, " - using default."
      ))
      return(default_parsers[[f]])
    }
  })
  # lapply() over an unnamed character vector returns an unnamed list;
  # name the elements so parsers$get_index_urls etc. work downstream.
  names(parsers) <- expected_functions
  return(parsers)
}
#' Find new article urls on index pages and insert into db
#'
#' \code{scrape_article_urls(blog_config, db_config)} searches the index pages
#' of the specified blog for article urls that are not yet present in the db,
#' and inserts those article urls into the db.
#'
#' @param blog_config list with blog specific parameters
#' @param db_config list of database connection parameters
#' @param run_test process only one index page and do not commit changes to db
#' @return integer number of article urls processed
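#' @examples
#' \dontrun{
#' # Sketch with placeholder values: blog_config is typically read from a
#' # toml file via load_config(); db_config holds DBI::dbConnect() arguments.
#' blog_config <- load_config("config/exampleblog.toml")
#' db_config <- list(
#'   host = "localhost", dbname = "blogdb",
#'   user = "scraper", password = "secret"
#' )
#' scrape_article_urls(blog_config, db_config, run_test = TRUE)
#' }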
#' @export
scrape_article_urls <- function(blog_config, db_config, run_test = FALSE) {
  index_config <- blog_config$index
  # Import parsing functions (blog specific or generic default)
  if ("custom_parsers" %in% names(index_config)) {
    parsers <- load_index_parsers(index_config$custom_parsers)
  } else {
    parsers <- load_index_parsers()
  }
  # Disable ssl certificate verification for httr requests
  httr::set_config(httr::config(ssl_verifypeer = 0L))
  # Connect to database; wrap the driver in list() so that c() appends
  # the connection parameters instead of trying to combine the S4 object
  con <- do.call(DBI::dbConnect, c(list(RPostgres::Postgres()), db_config))
  on.exit(DBI::dbDisconnect(con))
  # Get index pages (i.e. pages that contain article urls)
  index_urls <- parsers$get_index_urls(index_config)
  if (is.null(index_urls)) {
    stop(paste0("No index urls found for ", blog_config$id))
  }
  # In test mode only check the first index page
  if (run_test) {
    index_urls <- index_urls[1]
    message(paste(
      "... looking for article urls on index page",
      index_urls[1]
    ))
    message(paste("... using", index_config$articles_xpath))
  }
  # Extract articles from all index pages
  counter <- 0
  for (index_url in index_urls) {
    article_urls <- parsers$get_article_urls(
      index_url,
      index_config
    )
    if (run_test) {
      message("... ", length(article_urls), " articles found.")
    }
    if (length(article_urls) == 0) {
      warning("No articles found on ", index_url, ", stopping.")
      break
    }
    # If all articles are already in db, break loop because nothing new
    # is to be expected (currently disabled):
    # articles_in_db <- db_count_existing_urls(con, article_urls)
    # new_articles <- length(article_urls) - articles_in_db
    # if (new_articles == 0) break
    # counter <- counter + new_articles
    counter <- counter + length(article_urls)
    # In test mode do not commit changes to the database
    if (!run_test) {
      db_insert_article_urls(con, blog_config$id, article_urls)
    }
    sleep <- if (is.null(blog_config$seconds_between_requests)) {
      1
    } else {
      blog_config$seconds_between_requests
    }
    Sys.sleep(sleep)
  }
  print(paste("...", blog_config$id, ":", counter, "article urls processed."))
  # The database connection is closed by on.exit() above
  return(counter)
}
#' Scrape article urls from blog
#'
#' Scrape new article urls from blog index pages
#' @param blog_config_file character path to blog config file (toml)
#' @param db_config_file character path to db config file (toml)
#' @param run_test boolean, process only one index page without committing changes to db
#' @return numeric number of processed articles
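#' @examples
#' \dontrun{
#' # Both arguments are paths to toml config files; the paths below are
#' # placeholders.
#' scrape_blog("config/exampleblog.toml", "config/database.toml",
#'   run_test = TRUE
#' )
#' }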
#' @export
scrape_blog <- function(blog_config_file, db_config_file, run_test = FALSE) {
  if (run_test) {
    message("")
    message(paste("### Trying to scrape article urls for", blog_config_file))
  }
  blog_config <- load_config(blog_config_file)
  if (run_test) {
    message(paste(
      "...", blog_config$id, ": successfully parsed config"
    ))
  }
  db_config <- RcppTOML::parseTOML(
    db_config_file,
    escape = FALSE
  )
  result_count <- scrape_article_urls(
    blog_config, db_config, run_test
  )
  return(result_count)
}