# R/scrape_urls.R

#' scrape_urls: Scraping Urls from the Internet Archive
#'
#' `scrape_urls` scrapes the content of mementos and lower-level web pages stored in the Internet Archive, using XPath expressions by default
#'
#' @param Urls A character vector of mementos from the Internet Archive, or the output of retrieve_links()
#' @param Paths A named character vector of the content to be scraped from the memento. Takes XPath expressions by default.
#' @param collapse Logical. Collapse matching HTML nodes into a single string per path
#' @param startnum Index of the Url at which scraping starts. Useful for resuming when scraping breaks off during the process.
#' @param attachto An object from the same scraping process to which the newly scraped content is attached
#' @param CSS Logical. Interpret Paths as CSS selectors instead of XPath expressions
#' @param archiveDate Logical. Retrieve the archiving date
#' @param ignoreErrors Logical. Ignore errors for individual Urls and proceed with scraping
#' @param stopatempty Logical. Stop when scraping repeatedly does not succeed (see emptylim)
#' @param emptylim Number of consecutive Urls yielding empty results after which scraping breaks off
#' @param encoding Specify a default encoding for the homepage. Default is 'UTF-8'
#' @param lengthwarning Logical. Warn and ask for confirmation before scraping a large number of Urls (100 or more). Set to FALSE to disable the default warning.
#'
#' @return A tibble containing the Urls and the scraped content. Note that a memento being stored in the Internet Archive does not guarantee that the information from the homepage can actually be scraped. As the Internet Archive is an internet resource, it is always possible that a request fails due to connectivity problems. One easy and obvious solution is to retry the function.
#' @examples
#' \dontrun{
#' scrape_urls(
#' Urls = "https://web.archive.org/web/20201001000859/https://www.nytimes.com/section/politics",
#' Paths = c(title = "//article/div/h2//text()", teaser = "//article/div/p/text()"),
#' collapse = FALSE, archiveDate = TRUE)
#'
#' }
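#'
#' # A minimal sketch of resuming an interrupted run ('memento_urls' is a
#' # hypothetical character vector of mementos; 'prelim' is assumed to be the
#' # preliminary tibble returned when a previous call broke off, including its
#' # 'stoppedat' column):
#' \dontrun{
#' scrape_urls(
#' Urls = memento_urls,
#' Paths = c(title = "//article/div/h2//text()"),
#' startnum = unique(prelim$stoppedat),
#' attachto = prelim)
#' }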

# Importing dependencies with roxygen2
#' @importFrom xml2 read_html
#' @importFrom rvest html_nodes
#' @importFrom rvest html_text
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#' @importFrom stringr str_length
#' @importFrom tibble tibble
#' @importFrom stats rnorm
#' @importFrom utils txtProgressBar setTxtProgressBar

# Export function to namespace
#' @export



### Function --------------------



scrape_urls <-
  function(Urls,
           Paths,
           collapse = TRUE,
           startnum = 1,
           attachto = NULL,
           CSS = FALSE,
           archiveDate = FALSE,
           ignoreErrors = FALSE,
           stopatempty = TRUE,
           emptylim = 10,
           encoding = "UTF-8",
           lengthwarning = TRUE) {


  #### A priori consistency checks

    # Globally bind variables
    counter <- NULL


    # If Urls is dataframe, check if output from retrieve_links. If yes, reduce to Urls of interest and generate vector
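    # (Output from retrieve_links() is expected to be a dataframe with the two
    #  columns "baseUrl" and "links"; only the links column is kept.)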
    if(is.data.frame(Urls)){
      if(ncol(Urls)==1){
        Urls <- Urls[,1]
      } else
        if(ncol(Urls)>2){
          stop("Urls must be vector of Urls or output from retrieve_links(). Dataframes not obtained from retrieve_links() are not allowed.")
        } else
          if(ncol(Urls)==2){
            if (identical(names(Urls), c("baseUrl", "links"))){
              Urls <- Urls$links
            } else {
              stop("Urls must be vector of Urls or output from retrieve_links(). Dataframes not obtained from retrieve_links() are not allowed.")
            }
          }
    }

    # Check if Urls is atomic
    if(!is.atomic(Urls)){
      stop("Urls must be vector of Urls or output from retrieve_links(). Other object types are not allowed.")
    }


    # Urls must stem from the Internet Archive
    if (!any(stringr::str_detect(Urls, "web\\.archive\\.org")))
      stop (
        "Urls do not originate from the Internet Archive. Please use the retrieveArchiveLinks function to obtain Urls from the Internet Archive."
      )

    # Paths must be a named vector
    if (is.null(names(Paths)))
      stop ("Paths is not a named vector. Please provide a named vector of Xpath or CSS paths.")

    # Paths must be a character vector
    if (!is.character(Paths))
      stop (
        "Paths is not a character vector. Please provide a named character vector of Xpath or CSS paths."
      )


    # collapse must be logical
    if (!is.logical(collapse))
      stop ("collapse is not a logical value. Please provide TRUE or FALSE.")


    # startnum must be a single numerical value in the range of the length of Urls
    if (!is.numeric(startnum))
      stop (
        "startnum is not numeric. Please provide a numeric indicating at which Url you want to start the scraping process."
      )

    if (length(startnum) > 1)
      stop (
        "startnum is not a single value. Please provide a single numeric indicating at which Url you want to start the scraping process."
      )

    if (startnum > length(Urls))
      stop (
        "startnum value exceeds number of Urls given. Please provide a numeric indicating at which Url you want to start the scraping process."
      )


    # attachto must stem from same scraping process
    if (!is.null(attachto))
      if (setequal(colnames(attachto),
                   c("Urls", names(Paths), "stoppedat")
      ) == FALSE)
        stop ("attachto must be a failed output of this function.")


    # CSS must be logical
    if (!is.logical(CSS))
      stop ("CSS is not a logical value. Please provide TRUE or FALSE.")

    if (length(CSS) > 1)
      stop ("CSS is not a single value. Please provide TRUE or FALSE.")

    # archiveDate must be logical
    if (!is.logical(archiveDate))
      stop ("archiveDate is not a logical value. Please provide TRUE or FALSE.")

    if (length(archiveDate) > 1)
      stop ("archiveDate is not a single value. Please provide TRUE or FALSE.")

    # ignoreErrors must be logical
    if (!is.logical(ignoreErrors))
      stop ("ignoreErrors is not a logical value. Please provide TRUE or FALSE.")

    if (length(ignoreErrors) > 1)
      stop ("ignoreErrors is not a single value. Please provide TRUE or FALSE.")

    # stopatempty must be logical
    if (!is.logical(stopatempty))
      stop ("stopatempty is not a logical value. Please provide TRUE or FALSE.")

    if (length(stopatempty) > 1)
      stop ("stopatempty is not a single value. Please provide TRUE or FALSE.")


    # emptylim must be numeric
    if (!is.numeric(emptylim))
      stop ("emptylim is not numeric. Please provide a numeric value.")

    if (length(emptylim) > 1)
      stop ("emptylim is not a single value. Please provide a single numeric value.")

    # Encoding must be character
    if (!is.character(encoding))
      stop (
        "encoding is not a character value. Please provide a character string to indicate the encoding of the homepage you are about to scrape."
      )

    if (length(encoding) > 1)
      stop (
        "encoding is not a single value. Please provide a single character string to indicate the encoding of the homepage you are about to scrape."
      )

    # Warning and wait to proceed if large number of Urls
    if (length(Urls) >= 100 & lengthwarning & interactive()) {
      message(
        "Warning: You are about to scrape the information from a large number of Urls. This process may take some time. Press \"y\" to proceed.\nFor automated processes, e.g. on a virtual machine, disable this warning message with the option lengthwarning = FALSE."
      )
      line <- readline()

      if (line != "y") {
        stop("Execution halted")
      }

    }

#### Main function

# Generate list for output
scrapedUrls <- vector(mode = "list", length = length(Urls))
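# (Entries that remain NULL, e.g. for Urls skipped after errors, are dropped
#  automatically when the list is combined with do.call("rbind", ...) below.)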

# Generate counter for later use
counter <- 0


#Progress bar
if(length(Urls) > 1 & interactive()){
  pb <- txtProgressBar(min = 0,
                       max = length(Urls),
                       style = 3)
}


for (i in (seq_len(length(Urls)-(startnum-1))+(startnum-1))) {

  # Pause briefly every 20 Urls to space out requests to the Internet Archive
  if ((i > startnum) & ((i - startnum) %% 20 == 0)) {
    Sys.sleep(abs(rnorm(1, 5, 2)))
  }

  # Set up List
  data <- list()

  # Catch errors for Urls that cannot be retrieved (e.g. timeouts)
  possibleError <- tryCatch(
    r <- httr::GET(Urls[i], httr::timeout(20)),
    error = function(e)
      e
  )

  # If the request failed, record an empty row and skip to the next Url
  if (inherits(possibleError, "error")) {
    # Fill data
    data <- as.data.frame(matrix(ncol = length(Paths), nrow = 1))

    # Name columns
    cnames <- seq_len(length(Paths))
    cnames <- paste0("Xpath", cnames)
    cnames[names(Paths) != ""] <- names(Paths)[names(Paths) != ""]

    colnames(data) <- cnames

    # Fill in scrapedUrls List and add Url information
    scrapedUrls[[i]] <- cbind(Urls[[i]], data)
    colnames(scrapedUrls[[i]])[1] <- "Urls"
    next
  }

  # Get httr status
  status <- httr::status_code(r)

  # Retrieve if status = 200
  if (status == 200) {

    possibleError <- tryCatch({
    html <- xml2::read_html(r, encoding = encoding)

    # Retrieve elements and store in data list
    if (CSS == TRUE) {
      # Extract nodes
      for (x in seq_len(length(Paths))) {
        data[[x]] <- rvest::html_nodes(html, css = Paths[x])
        data[[x]] <- rvest::html_text(data[[x]])
      }
    } else {
      # Extract nodes
      for (x in seq_len(length(Paths))) {
        data[[x]] <- rvest::html_nodes(html, xpath = Paths[x])
        data[[x]] <- rvest::html_text(data[[x]])
      }
    }

    # If collapse = TRUE, collapse retrieved html elements
    if (collapse == TRUE) {
      for (x in seq_len(length(Paths))) {
        data[[x]] <- paste(data[[x]], collapse = ' ')
      }
    }

  # End trycatch
  },
  error = function(e) e)

    if(inherits(possibleError, "error")){
      if(ignoreErrors == TRUE){
        next
      } else {
        cat("ERROR in Urls[",i,"]:", conditionMessage(possibleError), "\n")
      }

    }

  # End status == 200 if clause
  } else {
    # Fill data
    data <- as.data.frame(matrix(ncol = length(Paths), nrow = 1))

    # Name columns
    cnames <- seq_len(length(Paths))
    cnames <- paste0("Xpath", cnames)
    cnames[names(Paths) != ""] <- names(Paths)[names(Paths) != ""]

    colnames(data) <- cnames
  }


  # Stops

    # If collapse = FALSE, stop if the number of retrieved elements differs across paths
    if (collapse == FALSE & length(unique(sapply(data, length)))!=1 & ignoreErrors == FALSE) {

      # Preliminary output
      predata <- do.call("rbind", scrapedUrls)

      output <- tibble::tibble(predata, stoppedat = i)

      warning(
        paste0(
          "Error in scraping of Url ",
          i,
          " '",
          Urls[i],
          "'. Number of elements for paths differs. Please provide paths that result in a similar number of elements per Url or use the option collapse = TRUE. A preliminary output has been printed."
        )
      )
      return(output)
    }


    # Stop if too many empty outputs in a row

    # Counter for empty outputs in a row
    if(collapse == FALSE){

      if (sum(sapply(sapply(data, stringr::str_length), sum) == 0) == length(Paths)){
        counter <-  counter + 1
      } else {
        counter <- 0
      }
    } else {

      if (sum(stringr::str_length(data) == 0) == length(Paths)){
        counter <-  counter + 1
      } else {
        counter <- 0
      }
    }


    if (counter >= emptylim &
        stopatempty == TRUE & ignoreErrors == FALSE) {

      # Preliminary output
      predata <- do.call("rbind", scrapedUrls)

      output <- tibble::tibble(predata, stoppedat = i)

      warning(
        paste0(
          "Error in scraping of Url ",
          i,
          " '",
          Urls[i],
          "'. Too many empty outputs in a row. A preliminary output has been printed."
        )
      )
      return(output)
    }


    # Stop if only some of the paths could be extracted
    if (collapse == FALSE){
      if (any(sapply(sapply(data, stringr::str_length), length)==0) & ignoreErrors == FALSE) {

        # Preliminary output
        predata <- do.call("rbind", scrapedUrls)

        output <- tibble::tibble(predata, stoppedat = i)

        warning(
          paste0(
            "Error in scraping of Url ",
            i,
            " '",
            Urls[i],
            "'. Only some of your Paths could be extracted. A preliminary output has been printed."
          )
        )
        return(output)

      }
    } else {
      if (length(unique(sapply(data, stringr::str_length)==0))!=1 & ignoreErrors == FALSE) {

        # Preliminary output
        predata <- do.call("rbind", scrapedUrls)

        output <- tibble::tibble(predata, stoppedat = i)

        warning(
          paste0(
            "Error in scraping of Url ",
            i,
            " '",
            Urls[i],
            "'. Only some of your Paths could be extracted. A preliminary output has been printed."
          )
        )
        return(output)

      }
    }



  # Data as dataframe

    # List as dataframe
    if (collapse == FALSE){
      data <- as.data.frame(sapply(data, '[', seq(max(sapply(data, length)))))
    } else {
      data <- as.data.frame(data)
    }


    # Add column names
    cnames <- seq_len(length(Paths))
    cnames <- paste0("Xpath", cnames)
    cnames[names(Paths) != ""] <- names(Paths)[names(Paths) != ""]

    colnames(data) <- cnames


    # Add the Url as first column and save in the list over all Urls
    scrapedUrls[[i]] <- cbind(Urls[[i]], data)
    colnames(scrapedUrls[[i]])[1] <- "Urls"


    # Update progress bar
    if(length(Urls) > 1 & interactive()){
      setTxtProgressBar(pb, i)
    }

} # End for loop

  # Generate output dataframe
  output <- do.call("rbind", scrapedUrls)

  # Attach attachto to retrieved data
  if (!is.null(attachto)) {
    output <- rbind(attachto[,-grep("stoppedat", colnames(attachto))], output)
  }

  # If archiveDate = TRUE, add information
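  # (Wayback Machine Urls embed a 14-digit timestamp, YYYYMMDDhhmmss, after
  #  "web.archive.org/web/"; the first eight digits give the archiving date.)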
  if (archiveDate == TRUE) {
    output$archiveDate <-
      anytime::anydate(
        stringr::str_extract(
          output$Urls,
          "(?<=\\:\\/\\/web\\.archive\\.org\\/web\\/)[0-9]{8}"
        )
      )
  }

  # Output as tibble
  output <- tibble::tibble(output)

  #### Return output
  return(output)

}
