# (stray text from a documentation-page scrape; commented out so the file parses)
# Nothing
#' scrape_urls: Scraping Urls from the Internet Archive
#'
#' `scrape_urls` scrapes Urls of mementos and lower-level web pages stored in the Internet Archive using XPaths as default
#'
#' @param Urls A character vector of the memento of the Internet Archive
#' @param Paths A named character vector of the content to be scraped from the memento. Takes XPath expressions as default.
#' @param collapse Logical value indicating whether to collapse matching html nodes, or character input of xpath by which matches are supposed to be collapsed. Structuring Xpaths can only be used with Xpath selectors as Paths input and CSS = FALSE. If a Xpath is given, the Paths argument only refers to children of the structure given in collapse.
#' @param startnum Specify the starting number for scraping the Urls. Important when scraping breaks during process.
#' @param attachto Scraper attaches new content to existing object in working memory. Object should stem from same scraping process.
#' @param CSS Use CSS selectors as input for the Paths
#' @param archiveDate Retrieve the archiving date
#' @param ignoreErrors Ignore errors for some Urls and proceed scraping
#' @param stopatempty Stop if scraping does not succeed
#' @param emptylim Specify the number of Urls not being scraped until break-off
#' @param encoding Specify a default encoding for the homepage. Default is 'UTF-8'
#' @param lengthwarning Warning function for large number of URLs appears. Set FALSE to disable default warning.
#' @param nonArchive Logical input. Can be set to TRUE if you want to use the archiveRetriever to scrape web pages outside the Internet Archive. Cannot be used in combination with archiveDate.
#'
#' @return This function scrapes the content of mementos or lower-level web pages from the Internet Archive. It returns a tibble including Urls and the scraped content. However, a memento being stored in the Internet Archive does not guarantee that the information from the homepage can be actually scraped. As the Internet Archive is an internet resource, it is always possible that a request fails due to connectivity problems. One easy and obvious solution is to re-try the function.
#' @examples
#' \dontrun{
#' scrape_urls(
#' Urls = "https://web.archive.org/web/20201001000859/https://www.nytimes.com/section/politics",
#' Paths = c(title = "//article/div/h2//text()", teaser = "//article/div/p/text()"),
#' collapse = FALSE, archiveDate = TRUE)
#'
#' scrape_urls(
#' Urls = "https://stackoverflow.com/questions/21167159/css-nth-match-doesnt-work",
#' Paths = c(ans="//div[@itemprop='text']/*", aut="//div[@itemprop='author']/span[@itemprop='name']"),
#' collapse = "//div[@id='answers']/div[contains(@class, 'answer')]",
#' nonArchive = TRUE,
#' encoding = "bytes")
#' }
# Importing dependencies with roxygen2
#' @importFrom xml2 read_html
#' @importFrom rvest html_elements
#' @importFrom rvest html_text2
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#' @importFrom stringr str_length
#' @importFrom tibble tibble
#' @importFrom stats rnorm
#' @importFrom utils txtProgressBar
#' @importFrom utils setTxtProgressBar
# Export function to namespace
#' @export
### Function --------------------
scrape_urls <-
  function(Urls,
           Paths,
           collapse = TRUE,
           startnum = 1,
           attachto = NULL,
           CSS = FALSE,
           archiveDate = FALSE,
           ignoreErrors = FALSE,
           stopatempty = TRUE,
           emptylim = 10,
           encoding = "UTF-8",
           lengthwarning = TRUE,
           nonArchive = FALSE) {
    #### A priori consistency checks ------------------------------------------

    # Globally bind variable (silences R CMD check notes; set properly below)
    counter <- NULL

    # If Urls is a data frame, it must be output from retrieve_links():
    # either a single-column frame, or a two-column frame named exactly
    # c("baseUrl", "links"). Reduce it to a plain character vector.
    if (is.data.frame(Urls)) {
      if (ncol(Urls) == 1) {
        Urls <- Urls[, 1]
      } else if (ncol(Urls) > 2) {
        stop("Urls must be vector of Urls or output from retrieve_links(). Dataframes not obtained from retrieve_links() are not allowed.")
      } else if (ncol(Urls) == 2) {
        if (identical(names(Urls), c("baseUrl", "links"))) {
          Urls <- Urls$links
        } else {
          stop("Urls must be vector of Urls or output from retrieve_links(). Dataframes not obtained from retrieve_links() are not allowed.")
        }
      }
    }

    # Urls must now be an atomic vector
    if (!is.atomic(Urls)) {
      stop("Urls must be vector of Urls or output from retrieve_links(). Other object types are not allowed.")
    }

    # archiveDate must be a single logical value
    if (!is.logical(archiveDate))
      stop ("archiveDate is not a logical value. Please provide TRUE or FALSE.")
    if (length(archiveDate) > 1)
      stop ("archiveDate is not a single value. Please provide TRUE or FALSE.")

    # nonArchive must be a single logical value
    if (length(nonArchive) > 1)
      stop("nonArchive must be a single value.")
    if (!is.logical(nonArchive))
      stop("nonArchive must be logical.")

    # nonArchive cannot be combined with archiveDate: pages outside the
    # Internet Archive carry no archiving timestamp in their Url
    if (nonArchive && archiveDate)
      stop("nonArchive = TRUE cannot be used with archiveDate = TRUE.")

    # Unless nonArchive is set, Urls must point to the Internet Archive
    if (!any(stringr::str_detect(Urls, "web\\.archive\\.org")) && !nonArchive)
      stop (
        "Urls do not originate from the Internet Archive. Please use the retrieveArchiveLinks function to obtain Urls from the Internet Archive."
      )

    # Paths must be a named character vector (names become output columns)
    if (is.null(names(Paths)))
      stop ("Paths is not a named vector. Please provide a named vector of Xpath or CSS paths.")
    if (!is.character(Paths))
      stop (
        "Paths is not a character vector. Please provide a named character vector of Xpath or CSS paths."
      )

    # collapse must be a single logical value or a single structuring XPath
    if (!is.logical(collapse) && !is.character(collapse))
      stop ("collapse is not a logical or character value. Please provide TRUE or FALSE, or set a xpath by which you want to structure your observations.")
    if (length(collapse) > 1)
      stop("collapse must be a single value.")

    # startnum must be a single numeric within the range of Urls
    if (!is.numeric(startnum))
      stop (
        "startnum is not numeric. Please provide a numeric indicating at which Url you want to start the scraping process."
      )
    if (length(startnum) > 1)
      stop (
        "startnum is not a single value. Please provide a single numeric indicating at which Url you want to start the scraping process."
      )
    if (startnum > length(Urls))
      stop (
        "startnum value exceeds number of Urls given. Please provide a numeric indicating at which Url you want to start the scraping process."
      )

    # attachto must stem from the same scraping process: its columns are the
    # Url column, the named paths, and the stoppedat marker
    if (!is.null(attachto) &&
        !setequal(colnames(attachto), c("Urls", names(Paths), "stoppedat")))
      stop ("attachto must be a failed output of this function.")

    # CSS must be a single logical value
    if (!is.logical(CSS))
      stop ("CSS is not a logical value. Please provide TRUE or FALSE.")
    if (length(CSS) > 1)
      stop ("CSS is not a single value. Please provide TRUE or FALSE.")

    # A structuring XPath in collapse only works with XPath selectors
    if (is.character(collapse) && CSS)
      stop("A structuring xpath as collapse statement can only be used with xpath. Please provide a xpath selector and set CSS to FALSE.")

    # ignoreErrors must be a single logical value
    if (!is.logical(ignoreErrors))
      stop ("ignoreErrors is not a logical value. Please provide TRUE or FALSE.")
    if (length(ignoreErrors) > 1)
      stop ("ignoreErrors is not a single value. Please provide TRUE or FALSE.")

    # stopatempty must be a single logical value
    if (!is.logical(stopatempty))
      stop ("stopatempty is not a logical value. Please provide TRUE or FALSE.")
    if (length(stopatempty) > 1)
      stop ("stopatempty is not a single value. Please provide TRUE or FALSE.")

    # emptylim must be a single numeric value
    if (!is.numeric(emptylim))
      stop ("emptylim is not numeric. Please provide a numeric value.")
    if (length(emptylim) > 1)
      stop ("emptylim is not a single value. Please provide a single numeric value.")

    # encoding must be a single character string
    if (!is.character(encoding))
      stop (
        "encoding is not a character value. Please provide a character string to indicate the encoding of the homepage you are about to scrape."
      )
    if (length(encoding) > 1)
      stop (
        "encoding is not a single value. Please provide a single character string to indicate the encoding of the homepage you are about to scrape."
      )

    # Warn and require confirmation before large jobs (interactive only)
    if (length(Urls) >= 100 && lengthwarning && interactive()) {
      message(
        "Warning: You are about to scrape the information from a large number of Urls. This process may take some time. Press \"y\" to proceed. \n For automated processes using a virtual machine disable this warning message with the option lengthwarning = F"
      )
      line <- readline()
      if (line != "y") {
        stop("Execution halted")
      }
    }

    #### Main function ---------------------------------------------------------

    # Preallocate result container: one element per Url
    scrapedUrls <- vector(mode = "list", length = length(Urls))

    # Counter of consecutive Urls that yielded only empty content
    counter <- 0

    # Progress bar (interactive sessions with more than one Url only)
    if (length(Urls) > 1 && interactive()) {
      pb <- txtProgressBar(min = 0,
                           max = length(Urls),
                           style = 3)
    }

    # Iterate over Urls, starting at startnum
    for (i in (seq_len(length(Urls) - (startnum - 1)) + (startnum - 1))) {
      # Pause briefly every 20 Urls to avoid hammering the server
      if ((i > startnum) && ((i - startnum) %% 20 == 0)) {
        Sys.sleep(abs(rnorm(1, 5, 2)))
      }

      # Container for the content scraped from this Url
      data <- list()

      # Request the page; capture connection errors instead of aborting
      possibleError <- tryCatch(
        r <- httr::GET(Urls[i], httr::timeout(20)),
        error = function(e)
          e
      )

      # On request failure, record an all-NA row for this Url and move on
      if (inherits(possibleError, "error")) {
        data <- as.data.frame(matrix(ncol = length(Paths), nrow = 1))
        # Column names: user-supplied names, or XpathN for unnamed paths
        cnames <- seq_along(Paths)
        cnames <- paste0("Xpath", cnames)
        cnames[names(Paths) != ""] <- names(Paths)[names(Paths) != ""]
        colnames(data) <- cnames
        scrapedUrls[[i]] <- cbind(Urls[[i]], data)
        colnames(scrapedUrls[[i]])[1] <- "Urls"
        next
      }

      # HTTP status of the response
      status <- httr::status_code(r)

      if (status == 200) {
        possibleError <- tryCatch({
          html <- xml2::read_html(r, encoding = encoding)
          if (is.logical(collapse)) {
            # Extract matching nodes per path, via CSS or XPath selectors
            if (isTRUE(CSS)) {
              for (x in seq_along(Paths)) {
                data[[x]] <- rvest::html_elements(html, css = Paths[x])
                data[[x]] <- rvest::html_text2(data[[x]])
              }
            } else {
              for (x in seq_along(Paths)) {
                data[[x]] <- rvest::html_elements(html, xpath = Paths[x])
                data[[x]] <- rvest::html_text2(data[[x]])
              }
            }
            # If collapse = TRUE, merge all matches per path into one string
            if (collapse) {
              for (x in seq_along(Paths)) {
                data[[x]] <- paste(data[[x]], collapse = ' ')
              }
            }
          } else {
            # collapse is a structuring XPath: count the structuring units on
            # the page, then evaluate each path relative to every unit
            struck <- rvest::html_elements(html, xpath = collapse)
            n <- if (inherits(struck, "xml_node")) 1 else length(struck)
            for (x in seq_along(Paths)) {
              data[[x]] <- vector("list", n)
              for (s in seq_len(n)) {
                data[[x]][[s]] <- rvest::html_elements(html, xpath = paste0("(", collapse, ")[", s, "]", Paths[x]))
                data[[x]][[s]] <- rvest::html_text2(data[[x]][[s]])
              }
              # One collapsed string per structuring unit
              data[[x]] <- vapply(data[[x]], function(v) paste(v, collapse = ' '), FUN.VALUE = "xxx")
            }
          }
          # End trycatch
        },
        error = function(e) e)
        if (inherits(possibleError, "error")) {
          if (isTRUE(ignoreErrors)) {
            next
          } else {
            # NOTE(review): reports the parsing error but intentionally keeps
            # going with whatever was extracted; the break-off checks below
            # then decide whether to return a preliminary output
            cat("ERROR in Urls[", i, "]:", conditionMessage(possibleError), "\n")
          }
        }
        # End status == 200 if clause
      } else {
        # Non-200 status: record an all-NA row for this Url
        data <- as.data.frame(matrix(ncol = length(Paths), nrow = 1))
        cnames <- seq_along(Paths)
        cnames <- paste0("Xpath", cnames)
        cnames[names(Paths) != ""] <- names(Paths)[names(Paths) != ""]
        colnames(data) <- cnames
      }

      #### Break-off checks

      # With collapse = FALSE, every path must match the same number of nodes;
      # otherwise return what has been scraped so far
      if (isFALSE(collapse) && length(unique(sapply(data, length))) != 1 && isFALSE(ignoreErrors)) {
        predata <- do.call("rbind", scrapedUrls)
        output <- tibble::tibble(predata, stoppedat = i)
        warning(
          paste0(
            "Error in scraping of Url ",
            i,
            " '",
            Urls[i],
            "'. Number of elements for paths differs. Please provide paths that result in a similar number of elements per Url or use the option collapse = TRUE. A preliminary output has been printed."
          )
        )
        return(output)
      }

      # Track consecutive Urls where every path produced only empty strings
      if (isFALSE(collapse) || is.character(collapse)) {
        if (sum(sapply(sapply(data, stringr::str_length), sum) == 0) == length(Paths)) {
          counter <- counter + 1
        } else {
          counter <- 0
        }
      } else {
        if (sum(stringr::str_length(data) == 0) == length(Paths)) {
          counter <- counter + 1
        } else {
          counter <- 0
        }
      }

      # Abort after emptylim consecutive empty Urls (unless disabled)
      if (counter >= emptylim &&
          isTRUE(stopatempty) && isFALSE(ignoreErrors)) {
        predata <- do.call("rbind", scrapedUrls)
        output <- tibble::tibble(predata, stoppedat = i)
        warning(
          paste0(
            "Error in scraping of Url ",
            i,
            " '",
            Urls[i],
            "'. Too many empty outputs in a row. A preliminary output has been printed."
          )
        )
        return(output)
      }

      # Abort if only a subset of the paths could be extracted
      if (isFALSE(collapse) || is.character(collapse)) {
        if (any(sapply(sapply(data, stringr::str_length), length) == 0) && isFALSE(ignoreErrors)) {
          predata <- do.call("rbind", scrapedUrls)
          output <- tibble::tibble(predata, stoppedat = i)
          warning(
            paste0(
              "Error in scraping of Url ",
              i,
              " '",
              Urls[i],
              "'. Only some of your Paths could be extracted. A preliminary output has been printed."
            )
          )
          return(output)
        }
      } else {
        if (length(unique(sapply(data, stringr::str_length) == 0)) != 1 && isFALSE(ignoreErrors)) {
          predata <- do.call("rbind", scrapedUrls)
          output <- tibble::tibble(predata, stoppedat = i)
          warning(
            paste0(
              "Error in scraping of Url ",
              i,
              " '",
              Urls[i],
              "'. Only some of your Paths could be extracted. A preliminary output has been printed."
            )
          )
          return(output)
        }
      }

      # Convert the result list to a data frame; with collapse = FALSE, pad
      # shorter columns with NA up to the length of the longest one
      if (isFALSE(collapse)) {
        data <- as.data.frame(sapply(data, '[', seq(max(sapply(data, length)))))
      } else {
        data <- as.data.frame(data)
      }

      # Column names: user-supplied names, or XpathN for unnamed paths
      cnames <- seq_along(Paths)
      cnames <- paste0("Xpath", cnames)
      cnames[names(Paths) != ""] <- names(Paths)[names(Paths) != ""]
      colnames(data) <- cnames

      # Prepend the Url and store this Url's result
      scrapedUrls[[i]] <- cbind(Urls[[i]], data)
      colnames(scrapedUrls[[i]])[1] <- "Urls"

      # Update progress bar
      if (length(Urls) > 1 && interactive()) {
        setTxtProgressBar(pb, i)
      }
    } # End for loop

    # Combine per-Url results into one data frame
    output <- do.call("rbind", scrapedUrls)

    # Prepend previously scraped results, dropping their stoppedat column
    if (!is.null(attachto)) {
      output <- rbind(attachto[, -grep("stoppedat", colnames(attachto))], output)
    }

    # If requested, extract the archiving date (the first 8 digits of the
    # timestamp following /web/ in an Internet Archive memento Url)
    if (isTRUE(archiveDate)) {
      output$archiveDate <-
        anytime::anydate(
          stringr::str_extract(
            output$Urls,
            "(?<=\\:\\/\\/web\\.archive\\.org\\/web\\/)[0-9]{8}"
          )
        )
    }

    #### Return output as tibble
    output <- tibble::tibble(output)
    return(output)
  }
# (page-footer text from a documentation scrape; commented out so the file parses)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.