R/images_noalt_scrap.R

Defines functions images_noalt_scrap

Documented in images_noalt_scrap

#' Scrape images that don't have an 'alt' attribute
#'
#' Reads the web page at `link` and collects every `<img>` element that has
#' no `alt` attribute.
#'
#' @param link the URL of the web page, as a character string. Only the
#'   first element is scraped.
#' @param askRobot logical. Should the function ask the robots.txt whether
#'   we're allowed to scrape the web page? Default is FALSE. The answer is
#'   only reported through a message; it never prevents the scrape.
#'
#' @return a `noquote` character vector obtained by converting the matching
#'   `<img>` nodes with `as.character()` (i.e. the markup of each image
#'   lacking an "alt" attribute). Returns `NULL`, with a message, when no
#'   such image is found, and `NA`, with a message, when reading the page
#'   fails.
#' @export
#'
#' @examples \donttest{
#'
#' images_noalt_scrap(link = "https://www.r-consortium.org/")
#'
#' }
#'
#' @importFrom rvest html_nodes html_attr %>%
#' @importFrom xml2 read_html
#' @importFrom robotstxt paths_allowed
#' @importFrom crayon green
#' @importFrom crayon bgRed
#' @importFrom curl has_internet
#' @importFrom utils download.file

images_noalt_scrap <- function(link, askRobot = FALSE) {
  if (missing(link)) {
    stop("the 'link' parameter is mandatory")
  }

  if (!is.character(link)) {
    # Kept on one line: a string literal broken across source lines would
    # embed the newline and indentation in the error message.
    stop("the 'link' parameter must be provided as a character string")
  }

  # Ask robot related ----
  # Advisory only: the verdict is reported, the scrape proceeds either way.
  if (askRobot) {
    # isTRUE(all(...)) keeps the condition a single TRUE/FALSE even if
    # paths_allowed() yields NA or more than one value.
    if (isTRUE(all(paths_allowed(link)))) {
      message(green("the robot.txt doesn't prohibit scraping this web page"))
    } else {
      message(bgRed(
        "WARNING: the robot.txt doesn't allow scraping this web page"
      ))
    }
  }

  # Scraping ----
  tryCatch(
    expr = {
      # Only the first URL ever contributed to the result, so only that
      # page is downloaded.
      page <- read_html(link[[1]])
      img_nodes <- html_nodes(page, "img:not([alt])")
      final_urls <- noquote(as.character(img_nodes))

      if (length(final_urls) == 0) {
        message(paste0("No images without 'alt' attribute found at: ", link))
        NULL
      } else {
        final_urls
      }
    },
    error = function(cond) {
      # Match on the condition's message text rather than relying on the
      # implicit coercion of the whole condition object.
      reason <- conditionMessage(cond)
      bad_url <- grepl("current working directory", reason, fixed = TRUE) ||
        grepl("HTTP error 404", reason, fixed = TRUE)

      if (!has_internet()) {
        message(paste0("Please check your internet connexion: ", cond))
      } else if (bad_url) {
        message(paste0("The URL doesn't seem to be a valid one: ", link))
        message(paste0("Here the original error message: ", cond))
      } else {
        message(paste0("Undefined Error: ", cond))
      }

      # Every failure path reports through a message and yields NA.
      NA
    }
  )
}

Try the ralger package in your browser

Any scripts or data that you put into this service are public.

ralger documentation built on March 18, 2021, 1:06 a.m.