R/utils.R

Defines functions merge_xp load_config get_url

Documented in get_url load_config merge_xp

#' HTTP request
#'
#' \code{get_url(page_url)} makes an HTTP GET request to the target url
#' through \code{httr::RETRY}, so failed attempts are retried, and returns
#' the response body as text.
#'
#' @param page_url target url (string)
#' @return character with response body; NULL if the request (or reading
#'   its body) raised an error
get_url <- function(page_url) {
    # NOTE(review): per the httr documentation RETRY() hands back the last
    # response even when it carries an HTTP error status, so the body of an
    # error page may be returned here -- confirm whether callers expect NULL
    # in that case (httr::stop_for_status() would enforce it).
    tryCatch(
        {
            response <- httr::RETRY(
                "GET",
                page_url,
                httr::user_agent("Mozilla/5.0"),
                pause_base = 2,
                pause_cap = 60
            )
            httr::content(response, as = "text")
        },
        error = function(e) {
            message("Error getting ", page_url)
            # Log only the text of the error. Handing the condition object
            # itself to message() re-signals it, so an error handler further
            # up the call stack could intercept it and this function would
            # never get to return NULL.
            message(conditionMessage(e))
            NULL
        }
    )
}


#' Load blog configuration from file
#'
#' Parses a TOML configuration file and validates it. Missing top-level
#' sections abort immediately. Problems inside the \code{index} section are
#' collected, each one is raised as a warning, and a single error follows.
#' Missing \code{content_xpaths} entries are only reported with a message.
#'
#' @param config_file character filename of config file
#' @return list blog configuration
load_config <- function(config_file) {
    config <- RcppTOML::parseTOML(config_file, escape = FALSE)

    errors <- character()

    # Check if all sections are present in config
    expected_sections <- c("index", "content_xpaths")
    missing_sections <- setdiff(expected_sections, names(config))
    if (length(missing_sections) > 0) {
        stop(paste(
            "Sections",
            paste(missing_sections, collapse = ", "),
            "missing in",
            config_file
        ))
    }

    # Check if valid index type is defined
    index_type <- config$index$type
    valid_index_types <- c("sitemaps", "pagination", "single-page")
    if (is.null(index_type)) {
        errors <- c(errors, paste("index$type missing in", config_file))
    } else if (length(index_type) != 1L ||
        !(index_type %in% valid_index_types)) {
        errors <- c(
            errors,
            paste("Invalid index$type", paste(index_type, collapse = ", "))
        )
    }

    # Check if required index config values are defined.
    # identical() keeps the comparisons safe when index$type is missing or
    # malformed; `==` would give a zero-length condition and abort the `if`.
    expected_index_values <- c(
        "base_url", "articles_xpath", "articles_base_url"
    )
    if (identical(index_type, "sitemaps")) {
        expected_index_values <- c(expected_index_values, "sitemaps_xpath")
    }
    missing_index_values <- setdiff(expected_index_values, names(config$index))
    if (length(missing_index_values) > 0) {
        # One error entry per missing value
        errors <- c(
            errors,
            paste(paste0("index$", missing_index_values), "missing")
        )
    }
    if (identical(index_type, "pagination")) {
        # Either of the two keys is sufficient for a paginated index
        pagination_keys <- c("pagination_xpath", "number_of_pages")
        if (!any(pagination_keys %in% names(config$index))) {
            errors <- c(
                errors,
                "index$pagination_xpath OR index$number_of_pages missing"
            )
        }
    }

    # Check if all content xpaths are defined (informational only)
    expected_content_xpaths <- c(
        "title",
        "author",
        "date",
        "text",
        "tags",
        "links",
        "images"
    )
    missing_content_xpaths <- setdiff(
        expected_content_xpaths,
        names(config$content_xpaths)
    )
    if (length(missing_content_xpaths) > 0) {
        message(paste(
            config_file, ":",
            "no values defined for",
            paste0("content_xpaths$",
                missing_content_xpaths,
                collapse = ", "
            )
        ))
    }

    # Report every collected problem before aborting
    if (length(errors) > 0) {
        for (e in errors) {
            warning(e)
        }
        stop(paste("Missing values in config file", config_file))
    }

    config
}


#' Merge xpath expressions
#'
#' Joins several xpath expressions into one expression by placing "|"
#' between them.
#'
#' @param xp character vector of xpath expressions
#' @return character of xpath expressions merged by "|"
merge_xp <- function(xp) {
    separator <- "|"
    merged <- stringr::str_c(xp, collapse = separator)
    merged
}
digital-geopolitics/dgblogs documentation built on March 22, 2022, 6:40 p.m.