#' Extract text from html/xml node
#'
#' Internal helper to extract text from an html/xml document using xpath
#' expressions. Not to be called directly (used by other functions in the
#' package).
#'
#' @param html_doc html/xml document from \code{xml2::read_html}
#' @param xpaths character vector with xpath expressions
#' @param collapse boolean; if TRUE, join the extracted strings with a single space
#' @return character; if no text is found, "" when \code{collapse = TRUE}, otherwise \code{character(0)}
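#'
#' @examples
#' \dontrun{
#' # Minimal sketch using an inline HTML snippet; assumes merge_xp()
#' # simply unions the supplied xpath expressions
#' doc <- xml2::read_html("<div><p>Hello</p><p>world</p></div>")
#' get_node_text(doc, xpaths = "//p")                  # c("Hello", "world")
#' get_node_text(doc, xpaths = "//p", collapse = TRUE) # "Hello world"
#' }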
get_node_text <- function(html_doc, xpaths, collapse = FALSE) {
  if (is.null(xpaths)) {
    return(if (collapse) "" else character(0))
  }
  text <- html_doc %>%
    rvest::html_nodes(xpath = merge_xp(xpaths)) %>%
    rvest::html_text2() %>%
    stringi::stri_remove_empty_na()
  if (collapse) text <- stringr::str_c(text, collapse = " ")
  text
}
#' Parse date from string
#'
#' @param node_text character vector containing dates in various formats
#' @return character; the first parsed date formatted as "%Y-%m-%d %H:%M:%OS%z", or \code{NA} if no date could be parsed
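#'
#' @examples
#' \dontrun{
#' # Sketch of supported inputs; exact output depends on the timezone
#' # lubridate assumes (UTC by default)
#' parse_date("3. M\u00e4rz 2021")   # German month name, parsed as "dmy"
#' parse_date("2021-03-03 14:30:00") # matched by the "YmdHMS" order
#' }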
parse_date <- function(node_text) {
  months_german <- c(
    "Januar", "Februar", "M\u00e4rz", "April", "Mai", "Juni",
    "Juli", "August", "September", "Oktober", "November", "Dezember"
  )
  months_number <- c(
    "1.", "2.", "3.", "4.", "5.", "6.",
    "7.", "8.", "9.", "10.", "11.", "12."
  )
  temp_date <- node_text %>%
    stringi::stri_replace_all_fixed(
      pattern = months_german,
      replacement = months_number,
      vectorize_all = FALSE
    ) %>%
    lubridate::parse_date_time(
      orders = c(
        "dmy", "ymd", "dmyHM", "YmdHMSz", "YmdHMS", "dBY", "mdY"
      ),
      locale = Sys.getlocale("LC_TIME")
    ) %>%
    format(format = "%Y-%m-%d %H:%M:%OS%z")
  # Drop NAs before indexing, so an all-NA result is detected as
  # "no date found" instead of silently returning NA via [1]
  temp_date <- temp_date[!is.na(temp_date)]
  if (length(temp_date) == 0) {
    warning("Could not extract date.")
    return(NA)
  }
  temp_date[1]
}
#' Parse blog article
#'
#' @param raw_html character with raw html
#' @param xpaths list with xpath expressions per article field
#' @return list with parsed article content (author, title, text, date,
#'   tags, language, links, media)
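#'
#' @examples
#' \dontrun{
#' # Minimal sketch with an inline HTML snippet and hypothetical xpath
#' # expressions; real values come from the blog config file
#' raw_html <- "<html><body><h1>Titel</h1>
#'   <address>Jane Doe</address><time>3. M\u00e4rz 2021</time>
#'   <article><p>Hallo Welt</p></article></body></html>"
#' xpaths <- list(
#'   author = "//address",
#'   title = "//h1",
#'   text = "//article//p",
#'   date = "//time",
#'   tags = NULL,   # NULL fields yield character(0), serialised as []
#'   links = NULL,
#'   images = NULL
#' )
#' parse_article(raw_html, xpaths)
#' }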
parse_article <- function(raw_html, xpaths) {
  html <- rvest::read_html(raw_html)
  article <- list()
  article$author <- get_node_text(
    html,
    xpaths$author,
    collapse = TRUE
  )
  article$title <- get_node_text(
    html,
    xpaths$title,
    collapse = TRUE
  )
  # Keep the plain text so the language can be detected before the
  # text is JSON-encoded
  text <- get_node_text(html, xpaths$text)
  article$text <- jsonlite::toJSON(text)
  article$date <- get_node_text(
    html,
    xpaths$date
  ) %>% parse_date()
  article$tags <- get_node_text(
    html,
    xpaths$tags
  ) %>% jsonlite::toJSON()
  article$language <- textcat::textcat(
    stringr::str_c(text, collapse = " ")
  )
  article$links <- get_node_text(
    html,
    xpaths$links
  ) %>% jsonlite::toJSON()
  media_extensions <- stringr::regex(
    "\\.(jpg|gif|tiff|png|bmp|jpeg|svg|webp|mp4|m4v|mov|avi|flv|ogv|mp3|aac|wma|wav|flac)",
    ignore_case = TRUE
  )
  article$media <- get_node_text(
    html,
    xpaths$images
  ) %>%
    unique() %>%
    stringr::str_subset(media_extensions) %>%
    jsonlite::toJSON()
  article
}
#' Print article content
#'
#' Prints article content to console for testing.
#' @param article_content list parsed article content
#' @return No return value; called for its side effect of printing to the console
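#'
#' @examples
#' \dontrun{
#' # Sketch with a hand-built list mirroring the output of parse_article()
#' print_article_content(list(
#'   title = "Titel", author = "Jane Doe",
#'   date = "2021-03-03 00:00:00+0000",
#'   tags = "[]", links = "[]", media = "[]",
#'   text = '["Hallo Welt"]'
#' ))
#' }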
print_article_content <- function(article_content) {
  message(paste("... title:", article_content$title))
  message(paste("... author:", article_content$author))
  message(paste("... date:", article_content$date))
  message(paste("... tags:", article_content$tags))
  message(paste("... links:", article_content$links))
  message(paste("... images:", article_content$media))
  message(paste(
    "... text:",
    stringr::str_sub(article_content$text, 1, 60),
    "... (",
    stringr::str_length(article_content$text),
    "chars)"
  ))
}
#' Parse all articles
#'
#' @param blog_config list blog configuration
#' @param db_config list db connection configuration
#' @param run_test boolean; process only one article and do not commit changes to the db
#' @return numeric number of processed entries
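#'
#' @examples
#' \dontrun{
#' # Sketch with hypothetical configuration values; db_config is passed
#' # on to DBI::dbConnect() for RPostgres and must match its arguments
#' db_config <- list(
#'   host = "localhost", dbname = "blogdb",
#'   user = "parser", password = "secret"
#' )
#' blog_config <- load_config("config/blogs/example.toml")
#' parse_all_articles(blog_config, db_config, run_test = TRUE)
#' }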
parse_all_articles <- function(blog_config, db_config, run_test = FALSE) {
  # Connect to db and query unparsed articles
  con_select <- do.call(
    DBI::dbConnect,
    c(RPostgres::Postgres(), db_config)
  )
  con_update <- do.call(
    DBI::dbConnect,
    c(RPostgres::Postgres(), db_config)
  )
  unparsed_articles <- db_find_unparsed_articles(con_select, blog_config$id)
  # Make sure db connections and results are cleared when function exits
  on.exit({
    DBI::dbClearResult(unparsed_articles)
    DBI::dbDisconnect(con_select)
    DBI::dbDisconnect(con_update)
  })
  # Set chunk size for fetching articles from server
  chunk_size <- if (run_test) 1 else 500
  processed_article_count <- 0
  while (!DBI::dbHasCompleted(unparsed_articles)) {
    # Fetch next chunk of articles
    articles <- DBI::dbFetch(unparsed_articles, n = chunk_size)
    for (i in seq_len(nrow(articles))) {
      # Parse article content
      article_content <- try(
        {
          raw_html <- articles$html[i]
          parse_article(
            raw_html,
            blog_config$content_xpaths
          )
        },
        silent = TRUE
      )
      # try() returns an object of class "try-error" on failure
      if (inherits(article_content, "try-error")) {
        warning(paste0(
          "Error parsing article ",
          articles$id[i]
        ))
        next
      }
      processed_article_count <- processed_article_count + 1
      # If this is a test, print the result rather than insert into the db
      if (run_test) {
        message(paste("... url:", articles$url[i]))
        print_article_content(article_content)
        break
      }
      # Insert article info into the database
      tryCatch(
        {
          db_update_parsed_content(
            con_update,
            articles$id[i],
            articles$source_id[i],
            article_content
          )
        },
        error = function(cnd) {
          warning(paste0(
            "Error inserting parsed article with id ",
            articles$id[i],
            " to db\n",
            "(", conditionMessage(cnd), ")"
          ))
        }
      )
    }
    # If this is a test, break the while loop after one iteration
    if (run_test) break
  }
  processed_article_count
}
#' Parse blog
#'
#' Parses stored article data for the specified blog.
#' @param blog_config_file character path to blog config file (toml)
#' @param db_config_file character path to db config file (toml)
#' @param run_test boolean; only parse one article, without committing changes to the db
#' @return numeric number of processed articles
#' @export
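#' @examples
#' \dontrun{
#' # Hypothetical file paths; with run_test = TRUE a single article is
#' # parsed and printed instead of being written to the database
#' parse_blog_articles(
#'   "config/blogs/example.toml",
#'   "config/db.toml",
#'   run_test = TRUE
#' )
#' }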
parse_blog_articles <- function(blog_config_file, db_config_file, run_test = FALSE) {
  print(paste("Parsing articles for", blog_config_file))
  blog_config <- load_config(blog_config_file)
  if (run_test) {
    message(paste("...", blog_config$id, ": successfully parsed config"))
  }
  db_config <- RcppTOML::parseTOML(db_config_file)
  parsed_article_count <- parse_all_articles(
    blog_config,
    db_config,
    run_test = run_test
  )
  print(paste("...parsed", parsed_article_count, "articles"))
  # Return the count invisibly, matching the documented return value
  invisible(parsed_article_count)
}