R/crawler.R

#' Crawl URLs in a data frame.
#'
#' @param urls URLs to crawl.
#' @param parallel If \code{TRUE}, URLs are crawled on multiple CPU cores (using the parallel and pbapply packages).
#' @param progress_bar If \code{TRUE}, a progress bar is displayed (using the pbapply package).
#' @param message If \code{TRUE}, a message reports how many URLs are crawled.
#' @param ... Optional arguments passed to \code{webr::download_url}.
#'
#' @return A data frame with one URL per row.\cr
#'
#' This data frame contains the following columns:\cr
#' - \code{url}: initial URL\cr
#' - \code{crawl}: raw download result (output of \code{purrr::safely}, list-column)\cr
#' - \code{url_response}: returned URL\cr
#' - \code{date_crawl}: download datetime\cr
#' - \code{status_code}: status code returned by the HTTP request\cr
#' - \code{error}: error message
#'
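#' @examples
#' \dontrun{
#' # Minimal usage sketch; the URLs below are only illustrative.
#' crawled <- crawl_urls(c("https://www.r-project.org", "https://cran.r-project.org"))
#' }
#'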
#' @export
crawl_urls <- function(urls, parallel = FALSE, progress_bar = FALSE, message = FALSE, ...) {

  if (length(urls) == 0) {
    stop("There is no new urls to crawl", call. = FALSE)
  }

  crawl <- dplyr::tibble(url = urls)

  if (message == TRUE) {
    message(nrow(crawl), " URL(s) to crawl")
  }

  if (progress_bar == TRUE && !"pbapply" %in% utils::installed.packages()[, 1]) {
    stop("pbapply package needs to be installed", call. = FALSE)
  }

  if (progress_bar == TRUE) {
    pbapply::pboptions(type = "timer")
  }

  if (parallel == TRUE && !all(c("parallel", "pbapply") %in% utils::installed.packages()[, 1])) {
    stop("parallel and pbapply packages need to be installed", call. = FALSE)
  }

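  # Create a local cluster for parallel crawling; NULL keeps the crawl sequential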
  if (parallel == TRUE) {
    cluster <- parallel::makeCluster(parallel::detectCores())
  } else {
    cluster <- NULL
  }

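  # pbapply::pblapply provides the progress bar and accepts the cluster; otherwise fall back to lapply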
  if (progress_bar == TRUE || parallel == TRUE) {

    crawl$crawl <- crawl$url %>%
      pbapply::pblapply(purrr::safely(webr::download_url, otherwise = NA), ..., cl = cluster)

  } else {

    crawl$crawl <- crawl$url %>%
      lapply(purrr::safely(webr::download_url, otherwise = NA), ...)

  }

  if (parallel == TRUE) {
    parallel::stopCluster(cluster)
  }

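  # Split the purrr::safely() output into its result and error components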
  crawl <- crawl %>%
    dplyr::mutate(result = purrr::map(crawl, "result"),
                  error = purrr::map(crawl, "error"))

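  # Successful downloads: extract the response URL, download datetime and HTTP status code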
  result <- crawl %>%
    dplyr::filter(!is.na(result)) %>%
    dplyr::select(url, result) %>%
    dplyr::mutate(
      url_response = purrr::map_chr(result, "url"),
      date_crawl = purrr::map(result, attr, "date"),
      date_crawl = do.call(c, .data$date_crawl),
      date_crawl = lubridate::as_datetime(.data$date_crawl),
      status_code = purrr::map_int(result, "status_code")
    ) %>%
    dplyr::select(-result)

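  # Failed downloads: keep the error message and timestamp the attempt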
  error <- crawl %>%
    dplyr::filter(is.na(result)) %>%
    dplyr::select(url, error) %>%
    dplyr::mutate(
      error = purrr::map_chr(error, "message"),
      date_crawl = lubridate::now()
    )

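  # Recombine successes and failures, then join them back onto the original URLs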
  crawl <- dplyr::bind_rows(result, error) %>%
    dplyr::left_join(dplyr::select(crawl, -result, -error), ., by = "url")

  return(crawl)
}

#' Download a URL.
#'
#' @param url URL to download.
#' @param timeout Timeout in seconds for the HTTP request.
#' @param sleep Sleep time in seconds before the download.
#' @param api_format API format. Can be "json", "xml" or "csv".
#'
#' @return The downloaded content: an \code{httr} response object, or parsed content when \code{api_format} is supplied.
#'
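#' @examples
#' \dontrun{
#' # Illustrative sketch; the URL is only an example.
#' resp <- download_url("https://www.r-project.org", timeout = 5)
#' }
#'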
#' @export
download_url <- function(url, timeout = 10, sleep = NULL, api_format = NULL){

  if (!is.null(sleep)) {
    Sys.sleep(sleep)
  }

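  # Without an API format, return the raw httr response; otherwise parse according to the requested format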
  if (is.null(api_format)) {
    download <- httr::GET(url, httr::timeout(timeout))
  } else {
    download <- switch(
      api_format,
      json = jsonlite::fromJSON(url),
      xml = httr::GET(url, httr::timeout(timeout)) %>%
        httr::content(as = "text"),
      csv = readr::read_csv(url, col_types = readr::cols())
    )
  }

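  # Record the download time so crawl_urls() can report it as date_crawl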
  attr(download, "date") <- lubridate::now()

  return(download)
}

#' Download a file from a web server.
#'
#' @param url File URL.
#' @param path Path to save the file.
#'
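#' @examples
#' \dontrun{
#' # Illustrative sketch; URL and destination path are examples only.
#' download_file("https://example.com/data.csv", tempfile(fileext = ".csv"))
#' }
#'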
#' @export
download_file <- function(url, path){

  httr::GET(url, httr::write_disk(path.expand(path), overwrite = TRUE), httr::progress("down"))

}