# Parse html source with race tables ------------------
#' Parse html source with race tables
#'
#' @param css_query_tbl table with year and organization css queries, generated
#' by the function \code{\link{extract_orgs}}.
#' @param remDr Object class remote driver previosly connected using
#' \code{\link{connect_remDr}}.
#'
#' @return xml_nodeset of parsed html options to be used to build queries.
#'
#' @import RSelenium
#' @import rlang
extract_race_html_options <-
function(css_query_tbl, remDr) {
year_element <- remDr$findElement(
using = "css selector",
value = css_query_tbl$css_query_year)
# Click year element
year_element$clickElement()
temp_orgs <- xml2::read_html(year_element$getPageSource()[[1]]) %>%
rvest::html_nodes("#organization-selection") %>%
rvest::html_children()
#### Check if page is loaded ####
# Test if the elements of the page have finished loading
# Create testing object
test_loading <- stringr::str_detect(rvest::html_text(temp_orgs),"Loading")
# While loading is still the element being acessed, wait 3 seconds
# if it is still the same, wait more time.
# repeat until the while evaluation is FALSE.
while (TRUE %in% test_loading) {
# Wait
Sys.sleep(1)
# Parse the html again
temp_orgs <- xml2::read_html(year_element$getPageSource()[[1]]) %>%
rvest::html_nodes("#organization-selection") %>%
rvest::html_children()
# generate new testing string
test_loading <- stringr::str_detect(
rvest::html_text(temp_orgs), "Loading")
}
#### Organizations query ####
org_element <- remDr$findElement(
using = "css selector",
value = css_query_tbl$css_query_org)
org_element$clickElement()
temp_races <- xml2::read_html(org_element$getPageSource()[[1]]) %>%
rvest::html_nodes("#race-selection") %>%
rvest::html_children()
#### Test if page is loaded ####
test_loading <- stringr::str_detect(
rvest::html_text(temp_races),"Loading")
# While loading is still the element being acessed, wait 3 seconds
# if it is still the same, wait more time.
# repeat until the while evaluation is FALSE.
while (TRUE %in% test_loading) {
# Wait
Sys.sleep(1)
# Parse the html again
temp_races <- xml2::read_html(org_element$getPageSource()[[1]]) %>%
rvest::html_nodes("#race-selection") %>%
rvest::html_children()
# generate new testing string
test_loading <- stringr::str_detect(
rvest::html_text(temp_races),"Loading")
}
return(temp_races)
}
# Parse source html with race tables -----
#' Parsing html tables for each individual race in a xml_nodeset.
#'
#' @param race_xml_nodeset xml_nodeset generated by
#' \code{\link{extract_race_html_options}}.
#' @param remDr Object class remote driver previosly connected using
#'
#' @return XML document with race tables.
#'
#' @import RSelenium
#' @import rlang
race_table_parse <-
function(race_xml_nodeset, remDr) {
. <- NULL
temp_race <- purrr::map(
1:length(race_xml_nodeset),
function(race) {
css_query_race <- stringr::str_c(
"#race-selection > option:nth-child(",
race,
")")
race_element <- remDr$findElement(using = "css selector",
value = css_query_race)
race_element$clickElement()
Sys.sleep(1)
temp_race <- xml2::read_html(race_element$getPageSource()[[1]])
#### Check if table is loaded ####
# Check if table is loaded
test_loaded_table <-
temp_race %>%
rvest::html_node("table") %>%
rvest::html_table(fill = TRUE) %>%
.[[1]] %>%
tibble::as_tibble()
# Some tables will have no data.
# So, I created a timeout for this test.
# After 5 seconds of wating there is no info on the table
# we assume that is the case and move on.
trial <- 1
while(nrow(test_loaded_table) == 0 & trial < 5) {
Sys.sleep(1)
temp_race <- xml2::read_html(race_element$getPageSource()[[1]])
test_loaded_table <-
temp_race %>%
rvest::html_node("table") %>%
rvest::html_table(fill = TRUE) %>%
.[[1]] %>%
tibble::as_tibble()
trial = trial + 1;
} # while loop
return(temp_race)
} # map function
) # map
}
# Assemble race tables from xml documents -----
#' Assembre race tables from xml documents
#'
#' @param races_xml xml documents with race tables geneerated from
#' \code{\link{race_table_parse}}.
#' @param css_query_tbl individual css_query table used to access the
#' individual race
#' @param race_html xml_nodeset of parsed html race options. Used to extract
#' accurate information about race locations and start time.
#'
#' @return \code{list} with two tables: \code{race_results}
#' and \code{race_info}
#'
#' @importFrom rlang .data
#' @importFrom stringr fixed
#' @importFrom stringr regex
assemble_tbl <- function(
races_xml,
css_query_tbl,
race_html) {
. <- NULL
seq_xml_docs <- 1:length(races_xml)
temp_tidy_tbls <- purrr::map(
seq_xml_docs,
function(i) {
# i = 1
race <- races_xml[[i]]
### Solution for 0 valued table
### To me, the solution will to check if this is 0 valued table.
### If so, return empty tables with race_id and NA for all the rest.
### Might be important for the acount of how many races there are by
### Organization
#### create race_results table ####
race_results_tbl <- race %>%
rvest::html_nodes("table") %>%
rvest::html_table(fill = TRUE) %>%
.[[1]] %>%
tibble::as_tibble()
if (nrow(race_results_tbl) > 0) {
race_results_tbl <-
race_results_tbl %>%
dplyr::mutate(
n = 1:n()
)
##### Test if table contains summary rows ####
# some tables will have summary rows for best performing birds by loft.
# below I test if this is the case and if so, remove the corresponding rows
# ... and the ones bellow it.
test_below <-
race_results_tbl %>%
dplyr::filter(stringr::str_detect(.data$Pos, "Below"))
if (test_below %>% tally > 0) {
limit = test_below$n - 1
race_results_tbl <-
race_results_tbl %>%
dplyr::filter(between(row_number(), 1, limit))
}
### Create race_info table ####
# this is a mixture of info present at:
# .... - the css_query_css_query_tbl object
# .... - scrapping info from the race specific headed on the website
# For both the css paths and xpath I used Firefox developers tools
# I also create a unique key for each race based on
# ... the race organization, year and the race list position
} else {
race_results_tbl <- race_results_tbl[1,]
race_results_tbl <-
race_results_tbl %>%
dplyr::mutate_all(as.character) %>%
dplyr::mutate(n = NA_character_)
}
race_info_tbl <-
tibble::tibble(
raw_info = race_html[[i]] %>%
rvest::html_text(),
year = css_query_tbl$year,
organization = css_query_tbl$organization,
org_number = css_query_tbl$org_number,
date = rvest::html_nodes(race, "#race-selection") %>%
rvest::html_children() %>%
rvest::html_attr("data-date") %>%
.[i] %>%
lubridate::mdy(),
raw_location = rvest::html_nodes(race, "#race-selection") %>%
rvest::html_children() %>%
rvest::html_attr("data-id") %>%
.[i],
raw_release_weather =
rvest::html_node(
race,
css = paste(
".race-results > div:nth-child(1)",
"> div:nth-child(1) > div:nth-child(2)",
sep = "")
) %>%
rvest::html_text(),
# html is defected, have to copy the whole file and fix latter
raw_arrival_weather = rvest::html_node(
race,
xpath = "/html/body/div[3]/div[2]/div/div") %>%
rvest::html_text(),
raw_text = rvest::html_node(
race,
xpath = "/html/body/div[3]/div[2]"
) %>%
rvest::html_text(),
# race_id = stringr::str_c(
# .data$organization %>%
# stringr::str_replace_all(
# fixed(" "),
# replacement = "_") %>%
# stringr::str_replace_all(
# fixed("_-_"),
# replacement = "_") %>%
# stringr::str_to_lower(),
# "_race_",
# i
# )
race_id = stringr::str_c(
"y",
stringr::str_extract(css_query_tbl$css_query_year, regex("\\d{1,}")),
"o",
stringr::str_extract(css_query_tbl$css_query_org, regex("\\d{1,}")),
"r",
i
)
) %>%
dplyr::select(.data$race_id, everything())
#### Add unique identifier to the restuls table ####
race_results_tbl <-
race_results_tbl %>%
dplyr::mutate(race_id = race_info_tbl$race_id) %>%
dplyr::select(.data$race_id, everything())
#### Generate output####
output <- list(
"race_results_tbl" = race_results_tbl,
"race_info_tbl" = race_info_tbl
)
return(output)
})
return(temp_tidy_tbls)
}
# Table pre-processing -------
#' Table pre-process and aggregation by organization and by year
#'
#' @param tbls_list list of tables generated from \code{\link{assemble_tbl}}.
#'
#' @return list of tables by year and organization containing both iformation on
#' individual races and results for each pigeon entry in an individual race.
#'
#' @importFrom rlang .data
pre_process_tbls <- function(tbls_list) {
. <- NULL
tbls <-
list(
##### Bind rows for race_info ####
"race_info" = purrr::map_dfr(
tbls_list,
function(i) {
i$race_info_tbl
}),
##### Pre-process and bind rows for race_results ####
"race_results" = purrr::map(
tbls_list,
function(i) {
temp_tbl <- i$race_results_tbl
temp_tbl <- temp_tbl %>%
dplyr::mutate_all (as.character)
temp_tbl <- temp_tbl %>%
dplyr::arrange(.data$race_id) %>%
dplyr::rename(
to_win = .data$`To Win`,
ndb_points = .data$`WS Points`
)
temp_tbl <- temp_tbl %>%
dplyr::mutate(
Pos = as.integer(.data$Pos),
Section = dplyr::case_when(
.data$Section == "" ~ NA_character_,
.data$Section == "NA" ~ NA_character_,
TRUE ~ .data$Section
),
# Arrival = Arrival %>% lubridate::hms()
Miles = as.numeric(.data$Miles),
# to_win = as.numeric(to_win),
YPM = as.numeric(.data$YPM)
)
temp_tbl <- temp_tbl %>%
dplyr::select(-.data$n, -.data$`NDB Points`)
return(temp_tbl)
}) %>% do.call("rbind", .)
)
return(tbls)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.