R/css_query_build.R

Defines functions extract_orgs extract_years

Documented in extract_orgs extract_years

# Create css queries tables ----------------
# To scrape the data I first created functions that extract the needed queries
# from the website. These will be used by the selenium functions to instruct our
# server to click through on spefic fields on the website, in order to generate
# the tables we wanto to scrape.

#' Extract css queries for race years
#'
#' Given a parsed html, it will scrape and return a list of css queries to click
#' throught the \code{Years} field.
#'
#' @param parsed_html pardsed html page using \code{\link{get_page_source}}.
#'
#' @return tibble with css queries and relevant info.
#'
#' @import dplyr
#' @import rlang

extract_years <-
  function(parsed_html) {
    . <- NULL
    years_tib <- parsed_html %>%
      #load the html that is opened on the remote desktop
      # select node I need insert options
      rvest::html_nodes(c("#select-year")) %>%
      # load options
      rvest::html_children() %>%
      # parse them into text
      rvest::html_text() %>%
      tibble::tibble(
        year = .
      ) %>%
      # add list position
      # Create js queries using css selector to be used with Rselenium
      mutate(
        list_position = 1:n(),
        year_query = stringr::str_c(
          "#select-year > option:nth-child(",
          .data$list_position,
          ")")
      )

    # test if segment is loaded
    return(years_tib)
  }

# Extract queries for organizations -----

#' Extract queries for organizations
#'
#' Fuction that clicks on each year and extracts the queries to click on each
#' organization which has organized races.
#'
#' @param years tibble or data.drame with year information and year css queries generated by \code{\link{extract_years}}.
#' @param remDr Object class remote driver previosly connected using
#' \code{\link{connect_remDr}}.
#' @return \code{tibble} with years, organization name and css queries for both years
#' and organizations.
#'
#' @import RSelenium
#' @importFrom rlang .env
#' @importFrom rlang .data

extract_orgs <-
  function(years, remDr) {
    orgs <- purrr::map_dfr(
      1:nrow(years),
      function(y) {
        #find element to click, as specified in query on years object
        year_element <- remDr$findElement(
          using = 'css selector', years$year_query[y])
        # instruct Selenium to click on the button
        year_element$clickElement()

        #organizations by year_element
        temp_orgs <- xml2::read_html(year_element$getPageSource()[[1]]) %>%
          rvest::html_nodes("#organization-selection") %>%
          rvest::html_children()
        # Test if the elements of the page have finished loading
        # Create testing object
        test_loading <- stringr::str_detect(
          rvest::html_text(temp_orgs),
          "Loading")

        # While loading is still the element being acessed, wait 3 seconds
        # if it is still the same, wait more time.
        # repeat until the while evaluation is FALSE.
        while (TRUE %in% test_loading) {
          # Wait
          Sys.sleep(3)
          # Parse the html again
          temp_orgs <-  xml2::read_html(year_element$getPageSource()[[1]]) %>%
            rvest::html_nodes("#organization-selection") %>%
            rvest::html_children()
          # generate new testing string
          test_loading <- stringr::str_detect(
            rvest::html_text(temp_orgs),
            "Loading")
        }

        # Create tibble with sotred data and css_queries to be
        # ... used to scrape individual race data
        # Attribute @data-orgnum as is unique to each organization and has
        # ... to be used only in the recent years as an unique identifier.
        temp_orgs <-
          tibble::tibble (
            year = years$year[y],
            organization = rvest::html_text(temp_orgs),
            # number of the organization indicated on the data-orgnum attribute
            org_number = rvest::html_attr(temp_orgs, name = "data-orgnum")
          ) %>%
          dplyr::mutate(
            # Transfor org number
            # non-exitent are "null", NA for first case of every year and
            org_number = dplyr::case_when(
              is.na(org_number) ~ NA_real_,
              org_number == "null" ~ NA_real_,
              TRUE ~ as.numeric(org_number)
            ),
            # List position for building queries
            list_position = 1:n(),
            # query for year
            css_query_year = years$year_query[y],
            # query for organization
            css_query_org = stringr::str_c(
              "#organization-selection > option:nth-child(",
              .data$list_position,
              ")")
          ) %>%
          # Remove line with "Click" string.
          dplyr::filter(stringr::str_detect(.data$organization, "Click", negate = TRUE)) %>%
          dplyr::select(-.data$list_position)

        return(temp_orgs)
      }
    )
    return(orgs)
  }
tijoalca/pigeonscraper documentation built on Sept. 2, 2021, 9:48 a.m.