#' Crawl URLs and return the results in a data frame.
#'
#' @param urls A character vector of URLs to crawl.
#' @param parallel If \code{TRUE} then URLs are crawled using multiple CPU cores (using the parallel and pbapply packages).
#' @param progress_bar If \code{TRUE} then a progress bar is displayed (using the pbapply package).
#' @param message If \code{TRUE} then a message indicates how many URLs are being crawled.
#' @param ... Optional arguments passed to \code{webr::download_url}.
#'
#' @return A data frame with one URL per row.\cr
#'
#' This data frame contains the following columns:\cr
#' - \code{url}: initial URL\cr
#' - \code{crawl}: raw download result, as returned by \code{purrr::safely}\cr
#' - \code{url_response}: URL returned by the HTTP response\cr
#' - \code{date_crawl}: download datetime\cr
#' - \code{status_code}: status code returned by the HTTP request\cr
#' - \code{error}: error message if the download failed
#'
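#' @examples
#' \dontrun{
#' # A minimal usage sketch; the URLs below are placeholders, not real endpoints.
#' urls <- c("https://example.com", "https://example.org")
#'
#' # Sequential crawl with a progress bar and a start-up message
#' result <- crawl_urls(urls, progress_bar = TRUE, message = TRUE)
#'
#' # Parallel crawl over all CPU cores (requires the parallel and pbapply packages)
#' result <- crawl_urls(urls, parallel = TRUE)
#'
#' # Inspect the flattened status codes and any error messages
#' result[, c("url", "status_code", "error")]
#' }
#'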
#' @export
crawl_urls <- function(urls, parallel = FALSE, progress_bar = FALSE, message = FALSE, ...) {
  if (length(urls) == 0) {
    stop("There are no URLs to crawl", call. = FALSE)
  }
  crawl <- dplyr::tibble(url = urls)
  if (message) {
    message("Crawling ", nrow(crawl), " URL(s)")
  }
  if (progress_bar && !"pbapply" %in% utils::installed.packages()[, 1]) {
    stop("pbapply package needs to be installed", call. = FALSE)
  }
  if (progress_bar) {
    pbapply::pboptions(type = "timer")
  }
  if (parallel && !all(c("parallel", "pbapply") %in% utils::installed.packages()[, 1])) {
    stop("parallel and pbapply packages need to be installed", call. = FALSE)
  }
  if (parallel) {
    cluster <- parallel::makeCluster(parallel::detectCores())
  } else {
    cluster <- NULL
  }
  # download_url is wrapped in purrr::safely() so that a single failed URL
  # does not abort the whole crawl: failures yield result = NA plus an error.
  if (progress_bar || parallel) {
    crawl$crawl <- crawl$url %>%
      pbapply::pblapply(purrr::safely(webr::download_url, otherwise = NA), ..., cl = cluster)
  } else {
    crawl$crawl <- crawl$url %>%
      lapply(purrr::safely(webr::download_url, otherwise = NA), ...)
  }
  if (parallel) {
    parallel::stopCluster(cluster)
  }
  # Split each safely() wrapper into its result and error components
  crawl <- crawl %>%
    dplyr::mutate(result = purrr::map(crawl, "result"),
                  error = purrr::map(crawl, "error"))
  # Successful downloads: flatten the response URL, datetime and status code
  result <- crawl %>%
    dplyr::filter(!is.na(result)) %>%
    dplyr::select(url, result) %>%
    dplyr::mutate(
      url_response = purrr::map_chr(result, "url"),
      date_crawl = purrr::map(result, attr, "date"),
      date_crawl = do.call(c, .data$date_crawl),
      date_crawl = lubridate::as_datetime(.data$date_crawl),
      status_code = purrr::map_int(result, "status_code")
    ) %>%
    dplyr::select(-result)
  # Failed downloads: keep the error message and stamp the attempt time
  error <- crawl %>%
    dplyr::filter(is.na(result)) %>%
    dplyr::select(url, error) %>%
    dplyr::mutate(
      error = purrr::map_chr(error, "message"),
      date_crawl = lubridate::now()
    )
  # Reassemble, preserving the original rows and the raw crawl column
  crawl <- dplyr::bind_rows(result, error) %>%
    dplyr::left_join(dplyr::select(crawl, -result, -error), ., by = "url")
  return(crawl)
}
#' Download a URL.
#'
#' @param url URL to download.
#' @param timeout Timeout in seconds for the HTTP request.
#' @param sleep Sleep time in seconds before the download.
#' @param api_format API format. Can be "json", "xml" or "csv".
#'
#' @return The content of the downloaded URL.
#'
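#' @examples
#' \dontrun{
#' # A minimal usage sketch; the URLs below are placeholders.
#' # Plain download: returns an httr response object
#' page <- download_url("https://example.com", timeout = 5)
#' httr::status_code(page)
#'
#' # JSON endpoint: returns the object parsed by jsonlite::fromJSON(),
#' # waiting one second before the request
#' api <- download_url("https://example.com/api.json", sleep = 1, api_format = "json")
#' }
#'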
#' @export
download_url <- function(url, timeout = 10, sleep = NULL, api_format = NULL) {
  # Optional pause before the request, to throttle successive downloads
  if (!is.null(sleep)) {
    Sys.sleep(sleep)
  }
  if (is.null(api_format)) {
    download <- httr::GET(url, httr::timeout(timeout))
  } else {
    download <- switch(
      api_format,
      json = jsonlite::fromJSON(url),
      xml = httr::GET(url, httr::timeout(timeout)) %>%
        httr::content(as = "text"),
      csv = readr::read_csv(url, col_types = readr::cols())
    )
  }
  # Stamp the download time; crawl_urls() reads this attribute back
  attr(download, "date") <- lubridate::now()
  return(download)
}
#' Download a file from a web server.
#'
#' @param url File URL.
#' @param path Path to save the file.
#'
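#' @examples
#' \dontrun{
#' # A minimal usage sketch; the URL and path are placeholders.
#' download_file("https://example.com/report.pdf", "~/report.pdf")
#' }
#'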
#' @export
download_file <- function(url, path) {
  # Stream the response straight to disk, with a download progress bar
  httr::GET(url, httr::write_disk(path.expand(path), overwrite = TRUE), httr::progress("down"))
}