# library(xml2)
# library(rvest)
# library(R6)
#' Static class with utility functions for working with the web
#'
#' @export
WebUtils <- R6::R6Class(classname = "WebUtils",
public = list(),
private = list())
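# Note: the functions below are attached directly to the R6 generator object
# (WebUtils$fn <- function(...)), so they act as static methods - they are
# called as WebUtils$fn(...) without instantiating the class.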
WebUtils$lFetchTableFromUrlViaXPath <- function(lDictOfItems) {
  # ----------------------------------------------------------------------------
  # 0. parameters validation ---------------------------------------------------
  # 0.1. check if the parameter is a list
  if (!is.list(lDictOfItems)) {
    stop("The lDictOfItems parameter of the function lFetchTableFromUrlViaXPath is ",
         "not a list!")
  }
  # 0.2. check if the list length is zero
  if (length(lDictOfItems) == 0L) {
    stop("The lDictOfItems parameter of the function lFetchTableFromUrlViaXPath is ",
         "of length zero!")
  }
  # 0.3. check if all the list elements are also lists of length two that have character members
  if (!all(sapply(X = lDictOfItems, FUN = function(x) {
    is.list(x) && length(x) == 2L &&
      all(sapply(X = x, FUN = function(y) { is.character(y) && length(y) == 1L }))
  }))) {
    stop("Not all the members of the lDictOfItems parameter of the function ",
         "lFetchTableFromUrlViaXPath are lists of length two with character ",
         "scalar elements only!")
  }
  # 0.4. warn if the list is unnamed
  if (is.null(names(lDictOfItems))) {
    warning("WARNING inside lFetchTableFromUrlViaXPath function: the parameter lDictOfItems",
            " is an unnamed list!", immediate. = TRUE)
  }
  # ----------------------------------------------------------------------------
  # 1. iterate over the elements of the list lDictOfItems and fetch the data ---
  lOut <- vector(mode = "list", length = length(lDictOfItems))
  for (k in seq_along(lDictOfItems)) {
    # 1.1. extract the URL and the XPath for the current webpage
    cIterWebpageUrl <- lDictOfItems[[k]][[1L]]
    cIterXPath <- lDictOfItems[[k]][[2L]]
    message("Fetching tables from the URL: ", cIterWebpageUrl)
    # 1.2. load the URL
    res <- try(expr = { xml2::read_html(x = cIterWebpageUrl) }, silent = TRUE)
    # 1.3. if error, skip iteration
    if (methods::is(object = res, class2 = "try-error")) {
      message("Failed to load the website from URL - skipping: ", cIterWebpageUrl)
      next
    }
    # 1.4. extract the nodes matching the XPath
    message("Fetching table via XPath: ", cIterXPath)
    objIterXPathRes <-
      try(expr = { rvest::html_nodes(x = res, xpath = cIterXPath) }, silent = TRUE)
    # 1.5. if error, skip iteration
    if (methods::is(object = objIterXPathRes, class2 = "try-error")) {
      message("\t\tFailed to extract the nodes via the XPath - skipping: ",
              cIterWebpageUrl, " ", cIterXPath)
      next
    }
    # 1.6. if successful, parse the extracted nodes into data frame(s)
    dfIterRes <-
      try(expr = { rvest::html_table(x = objIterXPathRes) }, silent = TRUE)
    # 1.7. if error, skip iteration
    if (methods::is(object = dfIterRes, class2 = "try-error")) {
      message("\t\tFailed to parse the output retrieved from the website - skipping: ",
              cIterWebpageUrl, " ", cIterXPath)
      next
    }
    # 1.8. if all the previous steps were successful, save the result
    lOut[[k]] <- dfIterRes
  }
  # 1.9. if there are names on the input list, carry them over to the output
  if (!is.null(names(lDictOfItems))) {
    names(lOut) <- names(lDictOfItems)
  }
  return(lOut)
}
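# A minimal usage sketch of lFetchTableFromUrlViaXPath; the URL and XPath below
# are hypothetical placeholders chosen for illustration:
# lTables <- WebUtils$lFetchTableFromUrlViaXPath(lDictOfItems = list(
#   some_page = list("https://example.com/some-page", "//table[1]")
# ))
# lTables[["some_page"]] then holds the list of data frames parsed from the
# matched nodes, or NULL if any fetching/parsing step failed for that entry.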
WebUtils$dtFetchConstituentsOfSp500 <- function(cSource = "wikipedia") {
  # 1. input validation
  if (!cSource %in% c("wikipedia", "market_insider")) {
    stop("Only one of the following values of the cSource parameter of the ",
         "dtFetchConstituentsOfSp500 function is handled: 'wikipedia', 'market_insider'!")
  }
  # 2. fetching the table of S&P 500 components from Market Insider
  if (cSource == "market_insider") {
    cRootOfUrl <- "https://markets.businessinsider.com/index/components/s&p_500?p="
    cXPath <- "/html/body/main/div/div[1]/div[3]/div/div[1]/div[2]/table"
    # prepare the list of table addresses (the constituents are paginated)
    lDictOfItems <- as.list(x = paste0(cRootOfUrl, seq(1L, 10L, 1L)))
    lDictOfItems <- lapply(X = lDictOfItems, FUN = function(x, y) {
      list(x, y) }, y = cXPath)
    # call the lFetchTableFromUrlViaXPath function
    res <- try(expr = {
      WebUtils$lFetchTableFromUrlViaXPath(lDictOfItems = lDictOfItems)
    }, silent = TRUE)
    if (methods::is(object = res, class2 = "try-error")) {
      # if error, rethrow with an informative message
      stop("Error occurred inside dtFetchConstituentsOfSp500 during the call to ",
           "lFetchTableFromUrlViaXPath: ", res)
    } else if (is.list(x = res) && all(unlist(x = lapply(X = res, FUN = function(x) {
      !is.null(x) && is.data.frame(x = x[[1]]) })))) {
      # returned correctly - parse the data and return
      lData <- lapply(X = res, FUN = function(x) {
        data.table::as.data.table(x = x[[1]]) })
      dtData <- data.table::rbindlist(l = lData, use.names = TRUE, fill = TRUE)
      return(dtData)
    } else {
      # if another object was returned, throw an unhandled case error
      stop("Unhandled situation occurred inside dtFetchConstituentsOfSp500 during ",
           "the call to lFetchTableFromUrlViaXPath!")
    }
  }
  # 3. fetching the table of S&P 500 components from Wikipedia
  if (cSource == "wikipedia") {
    cRootOfUrl <- "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    cXPath <- '//*[@id="constituents"]'
    # prepare the list of table addresses
    lDictOfItems <- list(list(cRootOfUrl, cXPath))
    # call the lFetchTableFromUrlViaXPath function
    res <- try(expr = {
      WebUtils$lFetchTableFromUrlViaXPath(lDictOfItems = lDictOfItems)
    }, silent = TRUE)
    if (methods::is(object = res, class2 = "try-error")) {
      # if error, rethrow with an informative message
      stop("Error occurred inside dtFetchConstituentsOfSp500 during the call to ",
           "lFetchTableFromUrlViaXPath: ", res)
    } else if (is.list(x = res) && all(unlist(x = lapply(X = res, FUN = function(x) {
      !is.null(x) && is.data.frame(x = x[[1]]) })))) {
      # returned correctly - parse the data and return
      lData <- lapply(X = res, FUN = function(x) {
        data.table::as.data.table(x = x[[1]]) })
      dtData <- data.table::rbindlist(l = lData, use.names = TRUE, fill = TRUE)
      return(dtData)
    } else {
      # if another object was returned, throw an unhandled case error
      stop("Unhandled situation occurred inside dtFetchConstituentsOfSp500 during ",
           "the call to lFetchTableFromUrlViaXPath!")
    }
  }
}
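# Usage sketch: fetch the current S&P 500 constituents from Wikipedia as a
# data.table (requires internet access; the page layout may change over time):
# dtSp500 <- WebUtils$dtFetchConstituentsOfSp500(cSource = "wikipedia")
# head(dtSp500)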