dgblogs: Scrape text from blogs

Documented in get_node_text parse_all_articles parse_article parse_blog_articles parse_date print_article_content

#' Extract text from html/xml node
#'
#' Internal helper function to extract text from html/xml node.
#' Not to be called directly (used by other functions in the package).
#'
#' @param html_doc html/xml document from \code{xml2::read_html}
#' @param xpaths character vector with xpath expressions
#' @param collapse boolean join list of characters by blank space
#' @return character (no result: "" when \code{collapse = TRUE} else \code{character(0)})
get_node_text <- function(html_doc, xpaths, collapse = F) {
    if (is.null(xpaths)) {
        return(if (collapse) "" else character(0))
    }

    text <- html_doc %>%
        rvest::html_nodes(xpath = merge_xp(xpaths)) %>%
        rvest::html_text2() %>%
        stringi::stri_remove_empty_na()

    if (collapse) text <- stringr::str_c(text, collapse = " ")
    text
}



#' Parse date from string
#'
#' @param node_text character containing date in different formats
#' @return date-time object
parse_date <- function(node_text) {
    months_german <- c(
        "Januar",
        "Februar",
        "M\u00e4rz",
        "April",
        "Mai",
        "Juni",
        "Juli",
        "August",
        "September",
        "Oktober",
        "November",
        "Dezember"
    )
    months_number <- c(
        "1.", "2.", "3.", "4.", "5.", "6.",
        "7.", "8.", "9.", "10.", "11.", "12."
    )
    temp_date <- node_text %>%
        stringi::stri_replace_all_fixed(
            pattern = months_german,
            replacement = months_number,
            vectorise_all = FALSE
        ) %>%
        lubridate::parse_date_time(
            orders = c(
                "dmy", "ymd", "dmyHM", "YmdHMSz", "YmdHMS", "dBY", "mdY"
            ),
            locale = Sys.getlocale("LC_TIME")
        ) %>%
        format(format = "%Y-%m-%d %H:%M:%OS%z")

    temp_date <- temp_date[!is.na(temp_date)][1]

    if (length(temp_date) == 0) {
        warning("Could not extract date.")
        return(NA)
    }
    return(temp_date)
}



#' Parse blog article
#'
#' @param raw_html character with raw html
#' @param xpaths list with xpath expressions
parse_article <- function(raw_html, xpaths) {
    html <- rvest::read_html(raw_html)

    article <- list()

    article$author <- get_node_text(
        html,
        xpaths$author,
        collapse = T
    )

    article$title <- get_node_text(
        html,
        xpaths$title,
        collapse = T
    )

    article$text <- get_node_text(
        html,
        xpaths$text
    ) %>% jsonlite::toJSON()

    article$date <- get_node_text(
        html,
        xpaths$date
    ) %>% parse_date()

    article$tags <- get_node_text(
        html,
        xpaths$tags
    ) %>% jsonlite::toJSON()

    article$language <- textcat::textcat(
        stringr::str_c(article$text, collapse = " ")
    )

    article$links <- get_node_text(
        html,
        xpaths$links
    ) %>% jsonlite::toJSON()

    media_extensions <- stringr::regex(
        "\\.(jpg|gif|tiff|png|bmp|jpeg|svg|webp|mp4|m4v|mov|avi|flv|ogv|mp3|aac|wma|wav|flac)",
        ignore_case = T
    )

    article$media <- get_node_text(
        html,
        xpaths$images
    ) %>%
        unique() %>%
        stringr::str_subset(media_extensions) %>%
        jsonlite::toJSON()

    article
}



#' Print article content
#'
#' Prints article content to console for testing.
#' @param article_content list parsed article content
#' @return null
print_article_content <- function(article_content) {
    message(paste("... title:", article_content$title))
    message(paste("... author:", article_content$author))
    message(paste("... date:", article_content$date))
    message(paste("... tags:", article_content$tags))
    message(paste("... links:", article_content$links))
    message(paste("... images:", article_content$media))

    message(paste(
        "... text:",
        stringr::str_sub(article_content$text, 1, 60),
        "... (",
        stringr::str_length(article_content$text),
        "chars)"
    ))
}



#' Parse all articles
#'
#' @param blog_config list blog configuration
#' @param db_config list db connection configuration
#' @param run_test process only one article and do not committ changes to db
#' @return numeric number of processed entries
parse_all_articles <- function(blog_config, db_config, run_test = FALSE) {

    # Connect to db and query unparsed articles
    con_select <- do.call(
        DBI::dbConnect,
        c(RPostgres::Postgres(), db_config)
    )
    con_update <- do.call(
        DBI::dbConnect,
        c(RPostgres::Postgres(), db_config)
    )
    unparsed_articles <- db_find_unparsed_articles(con_select, blog_config$id)

    # Make sure db con and results are cleared when function exits
    on.exit({
        DBI::dbClearResult(unparsed_articles)
        DBI::dbDisconnect(con_select)
        DBI::dbDisconnect(con_update)
    })

    # Set chunk size for fetching articles from server
    chunk_size <- if (run_test) 1 else 500
    processed_article_count <- 0

    while (!DBI::dbHasCompleted(unparsed_articles)) {

        # Fetch next chunk of articles
        articles <- DBI::dbFetch(unparsed_articles, n = chunk_size)

        for (i in seq_len(nrow(articles))) {

            # Parse article content
            article_content <- try(
                {
                    raw_html <- articles$html[i]
                    parse_article(
                        raw_html,
                        blog_config$content_xpaths
                    )
                },
                silent = T
            )
            if (inherits(article_content, "error")) {
                warning(paste0(
                    "Error parsing article ",
                    articles$id[i]
                ))
                next
            }
            processed_article_count <- processed_article_count + 1

            # If this is a test print result rather than insert to db
            if (run_test) {
                message(paste("... url: ", articles$url[i]))
                print_article_content(article_content)
                break
            }

            # Insert article info to database
            tryCatch(
                error = function(cnd) {
                    warning(paste0(
                        "Error inserting parsed article with id ",
                        articles$id[i],
                        " to db\n",
                        "(", conditionMessage(cnd), ")"
                    ))
                },
                {
                    db_update_parsed_content(
                        con_update,
                        articles$id[i],
                        articles$source_id[i],
                        article_content
                    )
                }
            )
        }
        # If this is a test break while loop after one iteration
        if (run_test) break
    }
    processed_article_count
}



#' Parse blog
#'
#' Parse data for specified blog
#' @param blog_config_file character path to blog config file (toml)
#' @param db_config_file character path to db config file (toml)
#' @param run_test boolean only parse one article without committing changes to db
#' @return numeric number of processed articles
#' @export
parse_blog_articles <- function(blog_config_file, db_config_file, run_test = F) {
    print(paste("Parsing articles for", blog_config_file))

    blog_config <- load_config(blog_config_file)

    if (run_test) {
        message(paste("...", blog_config$id, ": successfully parsed config"))
    }

    db_config <- RcppTOML::parseTOML(db_config_file)

    parsed_article_count <- parse_all_articles(
        blog_config,
        db_config,
        run_test = run_test
    )

    print(paste("...parsed", parsed_article_count, "articles"))
}

digital-geopolitics/dgblogs documentation built on March 22, 2022, 6:40 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

digital-geopolitics/dgblogs
Scrape text from blogs

R/parse-articles.R
In digital-geopolitics/dgblogs: Scrape text from blogs

Defines functions parse_blog_articles parse_all_articles print_article_content parse_article parse_date get_node_text

Documented in get_node_text parse_all_articles parse_article parse_blog_articles parse_date print_article_content

R Package Documentation

Browse R Packages

We want your feedback!

digital-geopolitics/dgblogs Scrape text from blogs

R/parse-articles.R In digital-geopolitics/dgblogs: Scrape text from blogs

Defines functions parse_blog_articles parse_all_articles print_article_content parse_article parse_date get_node_text

Documented in get_node_text parse_all_articles parse_article parse_blog_articles parse_date print_article_content

R Package Documentation

Browse R Packages

We want your feedback!

digital-geopolitics/dgblogs
Scrape text from blogs

R/parse-articles.R
In digital-geopolitics/dgblogs: Scrape text from blogs