# Create css queries tables ----------------
# To scrape the data I first created functions that extract the needed queries
# from the website. These will be used by the selenium functions to instruct our
# server to click through on spefic fields on the website, in order to generate
# the tables we wanto to scrape.
#' Extract css queries for race years
#'
#' Given a parsed html, it will scrape and return a list of css queries to click
#' throught the \code{Years} field.
#'
#' @param parsed_html pardsed html page using \code{\link{get_page_source}}.
#'
#' @return tibble with css queries and relevant info.
#'
#' @import dplyr
#' @import rlang
extract_years <-
function(parsed_html) {
. <- NULL
years_tib <- parsed_html %>%
#load the html that is opened on the remote desktop
# select node I need insert options
rvest::html_nodes(c("#select-year")) %>%
# load options
rvest::html_children() %>%
# parse them into text
rvest::html_text() %>%
tibble::tibble(
year = .
) %>%
# add list position
# Create js queries using css selector to be used with Rselenium
mutate(
list_position = 1:n(),
year_query = stringr::str_c(
"#select-year > option:nth-child(",
.data$list_position,
")")
)
# test if segment is loaded
return(years_tib)
}
# Extract queries for organizations -----
#' Extract queries for organizations
#'
#' Fuction that clicks on each year and extracts the queries to click on each
#' organization which has organized races.
#'
#' @param years tibble or data.drame with year information and year css queries generated by \code{\link{extract_years}}.
#' @param remDr Object class remote driver previosly connected using
#' \code{\link{connect_remDr}}.
#' @return \code{tibble} with years, organization name and css queries for both years
#' and organizations.
#'
#' @import RSelenium
#' @importFrom rlang .env
#' @importFrom rlang .data
extract_orgs <-
function(years, remDr) {
orgs <- purrr::map_dfr(
1:nrow(years),
function(y) {
#find element to click, as specified in query on years object
year_element <- remDr$findElement(
using = 'css selector', years$year_query[y])
# instruct Selenium to click on the button
year_element$clickElement()
#organizations by year_element
temp_orgs <- xml2::read_html(year_element$getPageSource()[[1]]) %>%
rvest::html_nodes("#organization-selection") %>%
rvest::html_children()
# Test if the elements of the page have finished loading
# Create testing object
test_loading <- stringr::str_detect(
rvest::html_text(temp_orgs),
"Loading")
# While loading is still the element being acessed, wait 3 seconds
# if it is still the same, wait more time.
# repeat until the while evaluation is FALSE.
while (TRUE %in% test_loading) {
# Wait
Sys.sleep(3)
# Parse the html again
temp_orgs <- xml2::read_html(year_element$getPageSource()[[1]]) %>%
rvest::html_nodes("#organization-selection") %>%
rvest::html_children()
# generate new testing string
test_loading <- stringr::str_detect(
rvest::html_text(temp_orgs),
"Loading")
}
# Create tibble with sotred data and css_queries to be
# ... used to scrape individual race data
# Attribute @data-orgnum as is unique to each organization and has
# ... to be used only in the recent years as an unique identifier.
temp_orgs <-
tibble::tibble (
year = years$year[y],
organization = rvest::html_text(temp_orgs),
# number of the organization indicated on the data-orgnum attribute
org_number = rvest::html_attr(temp_orgs, name = "data-orgnum")
) %>%
dplyr::mutate(
# Transfor org number
# non-exitent are "null", NA for first case of every year and
org_number = dplyr::case_when(
is.na(org_number) ~ NA_real_,
org_number == "null" ~ NA_real_,
TRUE ~ as.numeric(org_number)
),
# List position for building queries
list_position = 1:n(),
# query for year
css_query_year = years$year_query[y],
# query for organization
css_query_org = stringr::str_c(
"#organization-selection > option:nth-child(",
.data$list_position,
")")
) %>%
# Remove line with "Click" string.
dplyr::filter(stringr::str_detect(.data$organization, "Click", negate = TRUE)) %>%
dplyr::select(-.data$list_position)
return(temp_orgs)
}
)
return(orgs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.