R/query_scraping_funs.R

Defines functions pigeon_scraper scraper pigeon_query_builder

Documented in pigeon_query_builder pigeon_scraper scraper

# wrapper functions for scrapping ----

# Build css query table for years and organizations -----

#' Build css query table for years and organizations
#'
#' Connects to a remote selenium driver and scrapes the APRU website to
#' build a query table used to scrape individual race information.
#'
#' @param remDr remote driver connection object created with
#' \code{\link{connect_remDr}}.
#' @return tibble with year and organization info and corresponding css queries.
#'
#' @import RSelenium

pigeon_query_builder <-
  function(remDr) {
    page_source <- get_page_source(
      remDr = remDr,
      link = "https://pigeon-ndb.com/races/")
    years <- extract_years(parsed_html = page_source)
    css_query_tbl <- extract_orgs(years, remDr = remDr)
    return(css_query_tbl)
  }



# Scrape pigeon data -----

#' Scrapes race tale using css query table from
#' \code{\link{pigeon_query_builder}}
#'
#' Takes in a remote driver selenium driver and scrapes the APRU website to
#' using the a css query for year and organization. Parses race tables for every
#' race organized by an association within a year.
#'
#' @param css_query_entry Entry from css query tble generated by
#' \code{\link{pigeon_query_builder}}
#' @param remDr remote driver connection object created with
#' \code{\link{connect_remDr}}.
#'
#' @return list of tibble with race information and race results tables.
#'
#' @import RSelenium

scraper <-
  function(css_query_entry, remDr) {
    Sys.sleep(2)
    # Extract race html options
    race_html <- extract_race_html_options(css_query_tbl = css_query_entry,
                                           remDr = remDr)
    # parse htlm page with tables into a list of xml documents
    xml_doc <- race_table_parse(race_xml_nodeset = race_html, remDr= remDr)

    # Extract tables into tibbles and assemble race_results and race_info tbls
    raw_tbls <- assemble_tbl(races_xml = xml_doc,
                             css_query_tbl = css_query_entry,
                             race_html = race_html)

    # Pre-process tables
    tbls_list <- pre_process_tbls(raw_tbls)
    # remDr$close()
    return(tbls_list)
  }

# pigeon scraper function --------------

#' Scraping function to extract data from the APRU National Database. It uses
#' functionality from the \code{furrr} package to run multiple queries in
#' parallel.
#'
#' @param query_exists bolean to tell if if a css query table already
#' exists.
#' @param sequence integer or array of integers specifying which rows of the
#' css query table should be scraped.
#'
#' This is a do all function. It creates a query table, scrapes and processes
#' data from the APRU website in parallel. It save the output in
#' \code{data/raw_data} in single \code{.rds} files by group of queries.
#'

pigeon_scraper <-
  function(query_exists, sequence) {

    cat("Checking if path data/raw_data exists
        if not, one will be created \n")
    if(dir.exists(here::here("inst", "raw_data")) == FALSE) {
      dir.create(here::here("inst", "raw_data"), recursive = TRUE)
    }

    start_chrome_remDr(kill = FALSE)

    remDr <- connect_remDr()
    remDr$open(silent = TRUE)
    remDr_go_to_link(remDr = remDr, link = "https://pigeon-ndb.com/races/")

    if(query_exists == FALSE) {
      cat("building css query \n")
      css_query_tbl <- pigeon_query_builder(remDr = remDr)

      cat("saving css_query_tbl as .rds \n")
      saveRDS(css_query_tbl, here::here("inst", "css_query", "css_query_tbl.rds"))
    } else (
      css_query_tbl <- readRDS(
        file = here::here("inst", "css_query", "css_query_tbl.rds")
      )
    )

    cat("scraping function \n")
    if(is.null(sequence)){
      purrr::walk(
        1:nrow(css_query_tbl),
        function(i){
          print(i)
          temp_race_data  <- purrr::map(
            i,
            purrr::safely(function(g) {
              scraper(css_query_tbl[g, ], remDr)
            }))
          saveRDS(temp_race_data, file = here::here(
            "inst",
            "raw_data",
            paste0("tbl_", i, ".rds")))
          Sys.sleep(.5)
        }
      )
    } else {
      purrr::walk(
        sequence,
        function(i){
          print(i)
          temp_race_data  <- purrr::map(
            i,
            purrr::safely(function(g) {
              scraper(css_query_tbl[g, ], remDr)
            }))
          saveRDS(temp_race_data, file = here::here(
            "inst",
            "raw_data",
            paste0("tbl_", i, ".rds")))
          Sys.sleep(.5)
        }
      )
    }
    remDr$close()
  }
tijoalca/pigeonscraper documentation built on Sept. 2, 2021, 9:48 a.m.