R/scrape-blog.R

#' Get urls for index pages
#'
#' \code{default_get_index_urls(index_config)} returns the urls of index pages,
#' i.e. sitemaps or archive pages that contain links to articles.
#'
#' @param index_config blog-specific list of index configuration parameters
#' @return character vector of index page urls, or NULL on error
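#'
#' @details The example below is a usage sketch; the urls and xpath values are
#' hypothetical placeholders, only the field names follow this file.
#' @examples
#' \dontrun{
#' # Hypothetical sitemap-index config (base_url and sitemaps_xpath are placeholders)
#' index_config <- list(
#'     type = "sitemaps",
#'     base_url = "https://example.com/sitemap.xml",
#'     sitemaps_xpath = "//sitemap/loc",
#'     reverse_order = FALSE
#' )
#' default_get_index_urls(index_config)
#' }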
default_get_index_urls <- function(index_config) {

    # set default pagination start and increment if not provided
    if (is.null(index_config$increase_by)) index_config$increase_by <- 1
    if (is.null(index_config$start_at)) index_config$start_at <- 1

    # If all article urls are listed on main page,
    # index_config$base_url is the index page listing all articles.
    if (index_config$type == "single-page") {
        return(index_config$base_url)
    }

    # If blog is paginated and page number is part of the url,
    # parse last page number from the first index page and
    # return sequence of index urls from first page to last page.
    if (index_config$type == "pagination") {
        tryCatch(
            {
                if (is.null(index_config$number_of_pages)) {
                    page <- get_url(stringr::str_c(
                        index_config$base_url,
                        index_config$start_at
                    ))
                    index_config$number_of_pages <- page %>%
                        xml2::read_html() %>%
                        rvest::html_elements(
                            xpath = merge_xp(index_config$pagination_xpath)
                        ) %>%
                        as.character() %>%
                        stringr::str_extract("\\d+") %>%
                        as.integer() %>%
                        max(na.rm = TRUE)
                }
                index_urls <- stringr::str_c(
                    index_config$base_url,
                    seq(
                        from = index_config$start_at,
                        to = index_config$number_of_pages,
                        by = index_config$increase_by
                    )
                )
                return(index_urls)
            },
            error = function(e) {
                message(
                    "Error getting last page number from ",
                    stringr::str_c(index_config$base_url, index_config$start_at)
                )
                message(e)
                return(NULL)
            }
        )
    }

    # If base url points to sitemap with sub-sitemaps, extract
    # sitemap urls containing article urls based on sitemaps_xpath
    if (index_config$type == "sitemaps") {
        tryCatch(
            {
                page <- get_url(index_config$base_url)
                index_urls <- page %>%
                    xml2::read_html() %>%
                    rvest::html_elements(xpath = merge_xp(index_config$sitemaps_xpath)) %>%
                    rvest::html_text() %>%
                    as.character() %>%
                    unique()
                if (length(index_urls) == 0) index_urls <- NULL
                # Note: urls are returned in reverse order unless reverse_order is explicitly FALSE
                if (isFALSE(index_config$reverse_order)) {
                    index_urls
                } else {
                    rev(index_urls)
                }
            },
            error = function(e) {
                message(stringr::str_c(
                    "Error getting index urls from sitemap ",
                    index_config$base_url
                ))
                message(e)
                return(NULL)
            }
        )
    }
}



#' Extract article links from index page
#'
#' \code{default_get_article_urls(index_url, index_config)} loads an index page
#' and extracts all links to articles found on that page, based on the
#' blog-specific configuration.
#'
#' @param index_url character url of index page
#' @param index_config configuration for scraping and parsing of index pages
#' @return character vector of article urls, or NULL on error
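#'
#' @details The example below is a usage sketch; the index url, xpath and base
#' url are hypothetical placeholders.
#' @examples
#' \dontrun{
#' # Hypothetical config: xpath selecting nodes whose text is an article url,
#' # plus a base url for resolving relative links (both are placeholders)
#' index_config <- list(
#'     articles_xpath = "//loc",
#'     articles_base_url = "https://example.com/"
#' )
#' default_get_article_urls("https://example.com/sitemap-posts.xml", index_config)
#' }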
default_get_article_urls <- function(index_url, index_config) {
    tryCatch(
        {
            page <- get_url(index_url)

            article_urls <- page %>%
                xml2::read_html() %>%
                rvest::html_elements(
                    xpath = merge_xp(index_config$articles_xpath)
                ) %>%
                rvest::html_text2() %>%
                unique() %>%
                xml2::url_absolute(index_config$articles_base_url)

            if ("articles_subset" %in% names(index_config)) {
                article_urls <- stringr::str_subset(
                    article_urls,
                    index_config$articles_subset
                )
            }

            if ("articles_replace" %in% names(index_config)) {
                article_urls <- stringr::str_replace_all(
                    article_urls,
                    index_config$articles_replace[1],
                    index_config$articles_replace[2]
                )
            }
            article_urls %>% stringi::stri_remove_empty_na()
        },
        error = function(e) {
            message("Error extracting article links from ", index_url)
            message(e)
            return(NULL)
        }
    )
}



#' Import html parsing functions from file
#'
#' \code{load_index_parsers(filename)} imports a set of parsing functions
#' from the specified R file. It checks whether all expected functions are
#' defined in that file and falls back to the default implementation for
#' any that are missing.
#'
#' @param src optional, "default" or path to an R file with custom parsing functions
#' @return named list of parsing functions
#' @export
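#'
#' @details The example below is a sketch; \code{"my-blog-parsers.R"} is a
#' hypothetical file defining \code{get_index_urls} and/or \code{get_article_urls}.
#' @examples
#' \dontrun{
#' # Load custom parsers from a hypothetical file; any expected function
#' # missing from that file falls back to the default implementation
#' parsers <- load_index_parsers("my-blog-parsers.R")
#' index_urls <- parsers$get_index_urls(index_config)
#'
#' # Load the generic default parsers
#' parsers <- load_index_parsers()
#' }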
load_index_parsers <- function(src = "default") {
    expected_functions <- c(
        "get_index_urls",
        "get_article_urls"
    )

    default_parsers <- list(
        get_index_urls = default_get_index_urls,
        get_article_urls = default_get_article_urls
    )

    if (src == "default") {
        return(default_parsers)
    }

    parser_env <- new.env()
    source(src, local = parser_env)

    parsers <- lapply(expected_functions, function(f) {
        if (exists(f, where = parser_env)) {
            return(get(f, envir = parser_env))
        } else {
            message(paste0(
                f,
                " not defined in ",
                src, " - using default."
            ))
            return(default_parsers[[f]])
        }
    })
    # lapply() returns an unnamed list; add names so callers can use parsers$get_index_urls
    names(parsers) <- expected_functions
    return(parsers)
}



#' Find new article urls on index pages and insert into db
#'
#' \code{scrape_article_urls(blog_config, db_config)} searches index pages
#' of the specified blog for article urls that are not yet present in the db,
#' and inserts those article urls into the db.
#'
#' @param blog_config list of blog-specific parameters
#' @param db_config list of database connection parameters
#' @param run_test process only one index page and do not commit changes to db
#' @return numeric number of article urls added
#' @export
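#'
#' @details The example below is a sketch; the toml paths are hypothetical and
#' \code{load_config()} is the package's config loader used in \code{scrape_blog}.
#' @examples
#' \dontrun{
#' # Hypothetical config files; the paths are placeholders
#' blog_config <- load_config("config/blogs/example-blog.toml")
#' db_config <- RcppTOML::parseTOML("config/db.toml", escape = FALSE)
#' scrape_article_urls(blog_config, db_config, run_test = TRUE)
#' }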
scrape_article_urls <- function(blog_config, db_config, run_test = FALSE) {
    index_config <- blog_config$index

    # Import parsing functions (blog specific or generic default)
    if ("custom_parsers" %in% names(index_config)) {
        parsers <- load_index_parsers(index_config$custom_parsers)
    } else {
        parsers <- load_index_parsers()
    }

    # Disable ssl certificate verification for httr requests
    httr::set_config(httr::config(ssl_verifypeer = 0L))

    # Connect to database
    con <- do.call(DBI::dbConnect, c(RPostgres::Postgres(), db_config))
    on.exit(DBI::dbDisconnect(con))

    # Get index pages (i.e. pages that contain article urls)
    index_urls <- parsers$get_index_urls(index_config)
    if (is.null(index_urls)) {
        stop(paste0("No index urls found for ", blog_config$id))
    }

    # If run_test, only check the first index page
    if (run_test) {
        index_urls <- index_urls[[1]]
        message(paste(
            "... looking for article urls on index page",
            index_urls
        ))
        message(paste("... using", index_config$articles_xpath))
    }

    # Extract articles from all index pages
    counter <- 0
    for (index_url in index_urls) {
        article_urls <- parsers$get_article_urls(
            index_url,
            index_config
        )

        if (run_test) {
            message("... ", length(article_urls), " articles found.")
        }

        if (length(article_urls) == 0) {
            warning("No articles found on ", index_url, ", stopping.")
            break
        }

        # If all articles are already in db, break loop
        # because nothing new is to be expected
        # articles_in_db <- db_count_existing_urls(
        #     con,
        #     article_urls
        # )
        # new_articles <- length(article_urls) - articles_in_db
        # if (new_articles == 0) break

        #counter <- counter + new_articles
        counter <- counter + length(article_urls)

        db_insert_article_urls(con, blog_config$id, article_urls)
        sleep <- if (is.null(blog_config$seconds_between_requests)) 1 else blog_config$seconds_between_requests
        Sys.sleep(sleep)
    }
    message(paste("...", blog_config$id, ":", counter, "new article urls added."))
    return(counter)
}



#' Scrape article urls from blog
#'
#' Scrape new article urls from blog index pages.
#'
#' @param blog_config_file character path to blog config file (toml)
#' @param db_config_file character path to db config file (toml)
#' @param run_test boolean, only process one index page without committing changes to db
#' @return numeric number of article urls added
#' @export
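#'
#' @details The example below is a sketch; the toml file paths are hypothetical placeholders.
#' @examples
#' \dontrun{
#' # Test run against a single index page (file paths are placeholders)
#' scrape_blog(
#'     blog_config_file = "config/blogs/example-blog.toml",
#'     db_config_file = "config/db.toml",
#'     run_test = TRUE
#' )
#' }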
scrape_blog <- function(blog_config_file, db_config_file, run_test = FALSE) {
    if (run_test) {
        message("")
        message(paste("### Trying to scrape article urls for", blog_config_file))
    }
    blog_config <- load_config(blog_config_file)

    if (run_test) {
        message(paste(
            "...", blog_config$id, ": successfully parsed config"
        ))
    }
    db_config <- RcppTOML::parseTOML(
        db_config_file,
        escape = FALSE
    )
    result_count <- scrape_article_urls(
        blog_config, db_config, run_test
    )
    return(result_count)
}