R/WebUtils.R

Defines functions dtFetchConstituentsOfSp500 lFetchTableFromUrlViaXPath

# library(xml2)
# library(rvest)
# library(R6)

#' Static class with solutions to use when working with web
#'
#' @export
WebUtils <- R6::R6Class(classname = "WebUtils",
                        public = list(),
                        private = list())



WebUtils$lFetchTableFromUrlViaXPath <- function(lDictOfItems) {

  # ----------------------------------------------------------------------------
  # 0. parameters validation ---------------------------------------------------
  # 0.1. check if the parameter is a list
  if (!is.list(lDictOfItems)) {
    stop("The lDictOfItems parameter of the function lFetchTableFromUrlViaXPath is ",
         "not a list!")
  }
  # 0.2. check if the list length is zero
  if (length(lDictOfItems) == 0L) {
    stop("The lDictOfItems parameter of the function lFetchTableFromUrlViaXPath is ",
         "of length zero!")
  }
  # 0.3. check if all the list elements are also lists of length two that have character members
  if (!all(sapply(X = lDictOfItems, FUN = function(x) {
    is.list(x) & length(x) == 2L &
      all(sapply(X = x, FUN = function(y) { is.character(y) & length(y) == 1L } )) }))) {
    stop("Not all the members of the parameter lFetchTableFromUrlViaXPath are lists ",
         "of length two that have character scalar elements only! ")
  }
  # 0.4. check if the list is named or not
  if (is.null(names(lDictOfItems))) {
    warning("WARNING inside lFetchTableFromUrlViaXPath function: the parameter lDictOfItems",
            " is an unnamed list!", immediate. = TRUE)
  }

  # ----------------------------------------------------------------------------
  # 1. iterate over the elements of the list lDictOfItems and fetch the data ---
  lOut <- vector(mode = "list", length = length(lDictOfItems))
  for (k in 1:length(x = lDictOfItems)) {
    # 1.1. create an object to store the fetched for the current webpage
    cIterWebpageUrl <- lDictOfItems[[k]][[1L]]
    cIterXPath <- lDictOfItems[[k]][[2L]]
    message("Fetching tables from the URL: ", cIterWebpageUrl)
    # 1.2. load the URL
    res <- try(expr = { xml2::read_html(x = cIterWebpageUrl) }, silent = TRUE)
    # 1.3. if error, skip iteration
    if (methods::is(object = res, class2 = "try-error")) {
      message("Failed to load the website from URL - skipping: ", cIterWebpageUrl)
      next
    }
    # 1.4. extract the table from XPath
    message("Fetching table - XPath's name: ", cIterXPath)
    objIterXPathRes <-
      try(expr = { rvest::html_nodes(x = res, xpath = cIterXPath) }, silent = TRUE)
    # 1.5. if error, skip iteration
    if (methods::is(object = objIterXPathRes, class2 = "try-error")) {
      message("\t\tFailed to load the website from URL - skipping: ",
              cIterWebpageUrl, cIterXPath)
      next
    }
    # 1.6. if successful, parse the list res into data.frame
    dfIterRes <-
      try(expr = { rvest::html_table(x = objIterXPathRes) }, silent = TRUE)
    # 1.7.  if error, skip iteration
    if (methods::is(object = dfIterRes, class2 = "try-error")) {
      message("\t\tFailed to parse the output retrieved from the website - skipping: ",
              cIterWebpageUrl, cIterXPath)
      next
    }
    # 1.8. if all the previous steps successful, save
    lOut[[k]] <- dfIterRes
  }
  # 1.9. if there are names on the input list, add them
  if (!is.null(x = names(lDictOfItems))) {
    names(lOut) <- names(lDictOfItems)
  }

  return(lOut)
}



WebUtils$dtFetchConstituentsOfSp500 <- function(cSource = "wikipedia") {

  # 1. input validation
  if (cSource != "market_insider" & cSource != "wikipedia") {
    stop("Only one of the following values: 'wikipedia', 'market_insider' is available as",
         "the cSource parameter of the dtFetchConstituentsOfSp500 functions is handled! ")
  }

  # 2. fetching table from Market Insider
  if (cSource == "market_insider") {
    cRootOfUrl <- "https://markets.businessinsider.com/index/components/s&p_500?p="
    cXPath <- "/html/body/main/div/div[1]/div[3]/div/div[1]/div[2]/table"

    # prepare list of tables addresses
    lDictOfItems <- as.list(x = paste0(cRootOfUrl, seq(1L, 10L, 1L)))
    lDictOfItems <- lapply(X = lDictOfItems, FUN = function(x, y) {
      list(x[[1]], y) }, y = cXPath)

    # call the lFetchTableFromUrlViaXPath function
    res <- try(expr = {
      WebUtils$lFetchTableFromUrlViaXPath(lDictOfItems = lDictOfItems)
    }, silent = TRUE)

    if (methods::is(object = res, class2 = "try-error")) {
      # if error, return NULL
      stop("Error occured inside dtFetchConstituentsOfSp500 during the call to ",
           "lFetchTableFromUrlViaXPath: ", res)
    } else if (is.list(x = res) | all(unlist(x = lapply(X = res, FUN = function(x) {
      is.data.frame(x = x[[1]]) })))) {
      # returned correctly - parse the data and return
      lData <- lapply(X = res, FUN = function(x) {
        data.table::as.data.table(x = x[[1]])})
      dtData <- data.table::rbindlist(l = lData, use.names = TRUE, fill = TRUE)
      return(dtData)
    } else {
      # if another object returned, throw unhandled case error
      stop("Unhandled situation occured inside dtFetchConstituentsOfSp500 during the call to ",
           lFetchTableFromUrlViaXPath, ": ", res)
    }


  }

  # 3. fetching table of SP 500 components from Wikipedia
  if (cSource == "wikipedia") {

    cRootOfUrl <- "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    cXPath <- '//*[@id="constituents"]'

    # prepare list of tables addresses
    lDictOfItems <- list(list(cRootOfUrl, cXPath))

    # call the lFetchTableFromUrlViaXPath function
    res <- try(expr = {
      WebUtils$lFetchTableFromUrlViaXPath(lDictOfItems = lDictOfItems)
    }, silent = TRUE)

    if (methods::is(object = res, class2 = "try-error")) {
      # if error, return NULL
      stop("Error occured inside dtFetchConstituentsOfSp500 during the call to ",
           "lFetchTableFromUrlViaXPath: ", res)
    } else if (is.list(x = res) | all(unlist(x = lapply(X = res, FUN = function(x) {
      is.data.frame(x = x[[1]]) })))) {
      # returned correctly - parse the data and return
      lData <- lapply(X = res, FUN = function(x) {
        data.table::as.data.table(x = x[[1]])})
      dtData <- data.table::rbindlist(l = lData, use.names = TRUE, fill = TRUE)
      return(dtData)
    } else {
      # if another object returned, throw unhandled case error
      stop("Unhandled situation occured inside dtFetchConstituentsOfSp500 during the call to ",
           lFetchTableFromUrlViaXPath, ": ", res)
    }
  }

}
wegar-2/VariousUtils documentation built on Aug. 25, 2020, 4:02 p.m.