# wrapper functions for scraping ----
# Build css query table for years and organizations -----
#' Build css query table for years and organizations
#'
#' Connects to a remote selenium driver and scrapes the APRU website to
#' build a query table used to scrape individual race information.
#'
#' @param remDr remote driver connection object created with
#' \code{\link{connect_remDr}}.
#' @return tibble with year and organization info and corresponding css queries.
#'
#' @import RSelenium
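#'
#' @examples
#' \dontrun{
#' # A minimal usage sketch, assuming a Selenium server is already running
#' # and connect_remDr() returns a remote driver connection:
#' remDr <- connect_remDr()
#' remDr$open(silent = TRUE)
#' css_query_tbl <- pigeon_query_builder(remDr = remDr)
#' head(css_query_tbl)
#' remDr$close()
#' }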
pigeon_query_builder <- function(remDr) {
  page_source <- get_page_source(
    remDr = remDr,
    link = "https://pigeon-ndb.com/races/"
  )
  years <- extract_years(parsed_html = page_source)
  css_query_tbl <- extract_orgs(years, remDr = remDr)
  return(css_query_tbl)
}
# Scrape pigeon data -----
#' Scrapes race tables using the css query table from
#' \code{\link{pigeon_query_builder}}
#'
#' Takes in a remote Selenium driver and scrapes the APRU website using a
#' css query for year and organization. Parses race tables for every
#' race organized by an association within a year.
#'
#' @param css_query_entry Entry from the css query table generated by
#' \code{\link{pigeon_query_builder}}
#' @param remDr remote driver connection object created with
#' \code{\link{connect_remDr}}.
#'
#' @return list of tibbles with race information and race results tables.
#'
#' @import RSelenium
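#'
#' @examples
#' \dontrun{
#' # A minimal sketch, assuming an open remote driver and a query table
#' # built with pigeon_query_builder(); scrapes the first query entry:
#' remDr <- connect_remDr()
#' remDr$open(silent = TRUE)
#' css_query_tbl <- pigeon_query_builder(remDr = remDr)
#' tbls_list <- scraper(css_query_entry = css_query_tbl[1, ], remDr = remDr)
#' str(tbls_list, max.level = 1)
#' remDr$close()
#' }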
scraper <- function(css_query_entry, remDr) {
  Sys.sleep(2)
  # Extract race html options
  race_html <- extract_race_html_options(
    css_query_tbl = css_query_entry,
    remDr = remDr
  )
  # Parse html page with tables into a list of xml documents
  xml_doc <- race_table_parse(race_xml_nodeset = race_html, remDr = remDr)
  # Extract tables into tibbles and assemble race_results and race_info tbls
  raw_tbls <- assemble_tbl(
    races_xml = xml_doc,
    css_query_tbl = css_query_entry,
    race_html = race_html
  )
  # Pre-process tables
  tbls_list <- pre_process_tbls(raw_tbls)
  return(tbls_list)
}
# pigeon scraper function --------------
#' Scraping function to extract data from the APRU National Database
#'
#' This is a do-all function. It creates a css query table, then scrapes and
#' processes data from the APRU website one query at a time with \code{purrr},
#' wrapping each call in \code{purrr::safely} so a single failed query does
#' not stop the run. It saves the output in \code{inst/raw_data} as single
#' \code{.rds} files, one per query.
#'
#' @param query_exists boolean indicating whether a css query table already
#'   exists on disk.
#' @param sequence integer or vector of integers specifying which rows of the
#'   css query table should be scraped; if \code{NULL}, all rows are scraped.
#'
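#' @examples
#' \dontrun{
#' # A minimal sketch, assuming a local Selenium/Chrome setup reachable via
#' # start_chrome_remDr() and connect_remDr(). Builds the query table from
#' # scratch and scrapes only the first three queries:
#' pigeon_scraper(query_exists = FALSE, sequence = 1:3)
#'
#' # Reuse a previously saved query table and scrape every row:
#' pigeon_scraper(query_exists = TRUE, sequence = NULL)
#' }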
pigeon_scraper <- function(query_exists, sequence) {
  cat("Checking if path inst/raw_data exists; if not, one will be created \n")
  if (!dir.exists(here::here("inst", "raw_data"))) {
    dir.create(here::here("inst", "raw_data"), recursive = TRUE)
  }
  start_chrome_remDr(kill = FALSE)
  remDr <- connect_remDr()
  remDr$open(silent = TRUE)
  remDr_go_to_link(remDr = remDr, link = "https://pigeon-ndb.com/races/")
  if (!query_exists) {
    cat("building css query \n")
    css_query_tbl <- pigeon_query_builder(remDr = remDr)
    cat("saving css_query_tbl as .rds \n")
    # Guard against a missing css_query directory before saving
    if (!dir.exists(here::here("inst", "css_query"))) {
      dir.create(here::here("inst", "css_query"), recursive = TRUE)
    }
    saveRDS(css_query_tbl, here::here("inst", "css_query", "css_query_tbl.rds"))
  } else {
    css_query_tbl <- readRDS(
      file = here::here("inst", "css_query", "css_query_tbl.rds")
    )
  }
  cat("scraping function \n")
  # Scrape all rows when no sequence is supplied, otherwise only the
  # requested rows; each query is wrapped in purrr::safely() so one
  # failure does not abort the whole run.
  rows <- if (is.null(sequence)) seq_len(nrow(css_query_tbl)) else sequence
  purrr::walk(
    rows,
    function(i) {
      print(i)
      temp_race_data <- purrr::map(
        i,
        purrr::safely(function(g) {
          scraper(css_query_tbl[g, ], remDr)
        })
      )
      saveRDS(temp_race_data, file = here::here(
        "inst",
        "raw_data",
        paste0("tbl_", i, ".rds")
      ))
      Sys.sleep(.5)
    }
  )
  remDr$close()
}