R/download-articles.R

Defines functions download_blog download_articles download_article was_redirected

Documented in download_article download_articles download_blog was_redirected

#' Check if download was redirected
#'
#' @param response httr response object
was_redirected <- function(response) {
    status <- vapply(
        response$all_headers, `[[`, integer(1), "status"
    )
    any(status >= 300 & status < 400)
}

#' Download article
#'
#' @param url character article url
#' @return list $status: status information, $html: response body
download_article <- function(url) {
    tryCatch(
        {
            response <- httr::RETRY(
                "GET",
                url,
                #httr::config(followlocation = 0),
                httr::user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:88.0) Gecko/20100101 Firefox/88.0"),
                times = 3,
                pause_base = 1,
                pause_cap = 30,
                terminate_on = c(301, 400, 404, 410, 500)
            )

            status <- if (was_redirected(response)) {
                "300"
            } else {
                as.character(httr::status_code(response))
            }

            response_content <- httr::content(response, as = "text")
            html <- if (status %in% c("200", "300") & stringr::str_length(response_content) > 4) {
                response_content
            } else {
                NA
            }

            list(
                status = status,
                html = html
            )
        },
        error = function(e) {
            message("Error downloading ", url)
            message(e)
            return(list(
                status = "Unknown error",
                html = NA
            ))
        }
    )
}

#' Download all articles
#'
#' @param blog_config list blog configuration
#' @param db_config list db connection configuration
#' @return numeric number of processed entries
download_articles <- function(blog_config, db_config) {
    sleep_time <- if (is.null(blog_config$seconds_between_requests)) {
        1
    } else {
        blog_config$seconds_between_requests
    }


    # Connect to db and query unparsed articles
    con_select <- do.call(
        DBI::dbConnect,
        c(RPostgres::Postgres(), db_config)
    )
    con_update <- do.call(
        DBI::dbConnect,
        c(RPostgres::Postgres(), db_config)
    )
    new_articles <- db_find_new_urls(con_select, blog_config$id)

    # Make sure db con and results are cleared when function exits
    on.exit({
        DBI::dbClearResult(new_articles)
        DBI::dbDisconnect(con_select)
        DBI::dbDisconnect(con_update)
    })

    # Set chunk size for fetching articles from server
    chunk_size <- 500
    processed_article_count <- 0

    while (!DBI::dbHasCompleted(new_articles)) {

        # Fetch next chunk of articles
        articles <- DBI::dbFetch(new_articles, n = chunk_size)

        for (i in seq_len(nrow(articles))) {
            url <- articles$url[i]
            response <- download_article(url)
            processed_article_count <- processed_article_count + 1

            # Insert article info to database
            tryCatch(
                error = function(cnd) {
                    stop(
                        "Error inserting parsed article with id ",
                        articles$id[i],
                        " to db\n",
                        "(", conditionMessage(cnd), ")"
                    )
                },
                {
                    db_update_download_status(
                        con_update,
                        articles$id[i],
                        response$status,
                        response$html
                    )
                }
            )
            Sys.sleep(sleep_time + runif(1, min = -1, max = 1))
        }
        # If this is a test break while loop after one iteration
    }
    processed_article_count
}


#' Download blog articles
#'
#' Download all new articles for blog
#' @param blog_config_file character path to blog config file (toml)
#' @param db_config_file character path to db config file (toml)
#' @return numeric number of downloaded articles
#' @export
download_blog <- function(blog_config_file, db_config_file) {
    print(paste("Downloading articles for", blog_config_file))

    blog_config <- load_config(blog_config_file)
    db_config <- RcppTOML::parseTOML(db_config_file)

    downloaded_article_count <- download_articles(
        blog_config,
        db_config
    )

    print(
        paste(
            "...", blog_config$id,
            ": ", downloaded_article_count, "articles downloaded."
        )
    )
}
digital-geopolitics/dgblogs documentation built on March 22, 2022, 6:40 p.m.