R/parse_races.R

Defines functions pre_process_tbls assemble_tbl race_table_parse extract_race_html_options

Documented in assemble_tbl extract_race_html_options pre_process_tbls race_table_parse

# Parse html source with race tables ------------------

#' Parse html source with race tables
#'
#' @param css_query_tbl table with year and organization css queries, generated
#' by the function \code{\link{extract_orgs}}.
#' @param remDr Object class remote driver previosly connected using
#' \code{\link{connect_remDr}}.
#'
#' @return xml_nodeset of parsed html options to be used to build queries.
#'
#' @import RSelenium
#' @import rlang

extract_race_html_options <-
  function(css_query_tbl, remDr) {
    year_element <- remDr$findElement(
      using = "css selector",
      value = css_query_tbl$css_query_year)
    # Click year element
    year_element$clickElement()

    temp_orgs <- xml2::read_html(year_element$getPageSource()[[1]]) %>%
      rvest::html_nodes("#organization-selection") %>%
      rvest::html_children()

    #### Check if page is loaded ####

    # Test if the elements of the page have finished loading
    # Create testing object
    test_loading <- stringr::str_detect(rvest::html_text(temp_orgs),"Loading")

    # While loading is still the element being acessed, wait 3 seconds
    # if it is still the same, wait more time.
    # repeat until the while evaluation is FALSE.
    while (TRUE %in% test_loading) {
      # Wait
      Sys.sleep(1)
      # Parse the html again
      temp_orgs <- xml2::read_html(year_element$getPageSource()[[1]]) %>%
        rvest::html_nodes("#organization-selection") %>%
        rvest::html_children()
      # generate new testing string
      test_loading <- stringr::str_detect(
        rvest::html_text(temp_orgs), "Loading")
    }

    #### Organizations query ####

    org_element <- remDr$findElement(
      using = "css selector",
      value = css_query_tbl$css_query_org)

    org_element$clickElement()

    temp_races <- xml2::read_html(org_element$getPageSource()[[1]]) %>%
      rvest::html_nodes("#race-selection") %>%
      rvest::html_children()

    #### Test if page is loaded ####

    test_loading <- stringr::str_detect(
      rvest::html_text(temp_races),"Loading")

    # While loading is still the element being acessed, wait 3 seconds
    # if it is still the same, wait more time.
    # repeat until the while evaluation is FALSE.
    while (TRUE %in% test_loading) {
      # Wait
      Sys.sleep(1)
      # Parse the html again
      temp_races <- xml2::read_html(org_element$getPageSource()[[1]]) %>%
        rvest::html_nodes("#race-selection") %>%
        rvest::html_children()
      # generate new testing string
      test_loading <- stringr::str_detect(
        rvest::html_text(temp_races),"Loading")
    }
    return(temp_races)
  }

# Parse source html with race tables -----

#' Parsing html tables for each individual race in a xml_nodeset.
#'
#' @param race_xml_nodeset xml_nodeset generated by
#' \code{\link{extract_race_html_options}}.
#' @param remDr Object class remote driver previosly connected using
#'
#' @return XML document with race tables.
#'
#' @import RSelenium
#' @import rlang

race_table_parse <-
  function(race_xml_nodeset, remDr) {
    . <- NULL

    temp_race <- purrr::map(
      1:length(race_xml_nodeset),
      function(race) {
        css_query_race <- stringr::str_c(
          "#race-selection > option:nth-child(",
          race,
          ")")
        race_element <- remDr$findElement(using = "css selector",
                                          value = css_query_race)
        race_element$clickElement()
        Sys.sleep(1)

        temp_race <- xml2::read_html(race_element$getPageSource()[[1]])

        #### Check if table is loaded ####

        # Check if table is loaded
        test_loaded_table <-
          temp_race %>%
          rvest::html_node("table") %>%
          rvest::html_table(fill = TRUE) %>%
          .[[1]] %>%
          tibble::as_tibble()

        # Some tables will have no data.
        # So, I created a timeout for this test.
        # After 5 seconds of wating there is no info on the table
        # we assume that is the case and move on.
        trial <- 1

        while(nrow(test_loaded_table) == 0 & trial < 5) {
          Sys.sleep(1)
          temp_race <- xml2::read_html(race_element$getPageSource()[[1]])
          test_loaded_table <-
            temp_race %>%
            rvest::html_node("table") %>%
            rvest::html_table(fill = TRUE) %>%
            .[[1]] %>%
            tibble::as_tibble()
          trial = trial + 1;
        } # while loop

        return(temp_race)
      } # map function
    ) # map
  }

# Assemble race tables from xml documents -----

#' Assembre race tables from xml documents
#'
#' @param races_xml xml documents with race tables geneerated from
#' \code{\link{race_table_parse}}.
#' @param css_query_tbl individual css_query table used to access the
#'  individual race
#' @param race_html xml_nodeset of parsed html race options. Used to extract
#' accurate information about race locations and start time.
#'
#' @return \code{list} with two tables: \code{race_results}
#' and \code{race_info}
#'
#' @importFrom rlang .data
#' @importFrom stringr fixed
#' @importFrom stringr regex

assemble_tbl <- function(
  races_xml,
  css_query_tbl,
  race_html) {
  . <- NULL

  seq_xml_docs <- 1:length(races_xml)

  temp_tidy_tbls <- purrr::map(
    seq_xml_docs,
    function(i) {
      # i = 1
      race <- races_xml[[i]]


      ### Solution for 0 valued table
      ### To me, the solution will to check if this is 0 valued table.
      ### If so, return empty tables with race_id and NA for all the rest.
      ### Might be important for the acount of how many races there are by
      ### Organization

      #### create race_results table ####
      race_results_tbl <-   race %>%
        rvest::html_nodes("table") %>%
        rvest::html_table(fill = TRUE)  %>%
        .[[1]]   %>%
        tibble::as_tibble()

      if (nrow(race_results_tbl) > 0) {
        race_results_tbl <-
          race_results_tbl %>%
          dplyr::mutate(
            n = 1:n()
          )

        ##### Test if table contains summary rows ####
        # some tables will have summary rows for best performing birds by loft.
        # below I test if this is the case and if so, remove the corresponding rows
        # ... and the ones bellow it.

        test_below <-
          race_results_tbl %>%
          dplyr::filter(stringr::str_detect(.data$Pos, "Below"))


        if (test_below %>% tally > 0) {
          limit = test_below$n - 1
          race_results_tbl <-
            race_results_tbl %>%
            dplyr::filter(between(row_number(), 1, limit))
        }
        ### Create race_info table ####
        # this is a mixture of info present at:
        # .... - the css_query_css_query_tbl object
        # .... - scrapping info from the race specific headed on the website
        # For both the css paths and xpath I used Firefox developers tools
        # I also create a unique key for each race based on
        # ... the race organization, year and the race list position

      } else {
        race_results_tbl <- race_results_tbl[1,]

        race_results_tbl <-
          race_results_tbl %>%
          dplyr::mutate_all(as.character) %>%
          dplyr::mutate(n = NA_character_)
      }


      race_info_tbl <-
        tibble::tibble(
          raw_info = race_html[[i]] %>%
            rvest::html_text(),
          year = css_query_tbl$year,
          organization = css_query_tbl$organization,
          org_number = css_query_tbl$org_number,
          date = rvest::html_nodes(race, "#race-selection") %>%
            rvest::html_children() %>%
            rvest::html_attr("data-date") %>%
            .[i] %>%
            lubridate::mdy(),
          raw_location =  rvest::html_nodes(race, "#race-selection") %>%
            rvest::html_children() %>%
            rvest::html_attr("data-id") %>%
            .[i],
          raw_release_weather =
            rvest::html_node(
              race,
              css = paste(
                ".race-results > div:nth-child(1)",
                "> div:nth-child(1) > div:nth-child(2)",
                sep = "")
            ) %>%
            rvest::html_text(),
          # html is defected, have to copy the whole file and fix latter
          raw_arrival_weather = rvest::html_node(
            race,
            xpath = "/html/body/div[3]/div[2]/div/div") %>%
            rvest::html_text(),
          raw_text = rvest::html_node(
            race,
            xpath = "/html/body/div[3]/div[2]"
          ) %>%
            rvest::html_text(),
          # race_id =  stringr::str_c(
          #   .data$organization %>%
          #     stringr::str_replace_all(
          #       fixed(" "),
          #       replacement = "_") %>%
          #     stringr::str_replace_all(
          #       fixed("_-_"),
          #       replacement = "_") %>%
          #     stringr::str_to_lower(),
          #   "_race_",
          #   i
          # )
          race_id = stringr::str_c(
            "y",
            stringr::str_extract(css_query_tbl$css_query_year, regex("\\d{1,}")),
            "o",
            stringr::str_extract(css_query_tbl$css_query_org, regex("\\d{1,}")),
            "r",
            i
          )
        ) %>%
        dplyr::select(.data$race_id, everything())

      #### Add unique identifier to the restuls table ####
      race_results_tbl <-
        race_results_tbl %>%
        dplyr::mutate(race_id = race_info_tbl$race_id) %>%
        dplyr::select(.data$race_id, everything())

      #### Generate output####
      output <- list(
        "race_results_tbl" = race_results_tbl,
        "race_info_tbl" = race_info_tbl
      )
      return(output)
    })
  return(temp_tidy_tbls)
}

# Table pre-processing -------


#' Table pre-process and aggregation by organization and by year
#'
#' @param tbls_list list of tables generated from \code{\link{assemble_tbl}}.
#'
#' @return list of tables by year and organization containing both iformation on
#' individual races and results for each pigeon entry in an individual race.
#'
#' @importFrom rlang .data

pre_process_tbls <- function(tbls_list) {
  . <- NULL
  tbls <-
    list(
      ##### Bind rows for race_info ####
      "race_info" = purrr::map_dfr(
        tbls_list,
        function(i) {
          i$race_info_tbl
        }),
      ##### Pre-process and bind rows for race_results ####
      "race_results" = purrr::map(
        tbls_list,
        function(i) {
          temp_tbl <- i$race_results_tbl

          temp_tbl <- temp_tbl %>%
            dplyr::mutate_all (as.character)

          temp_tbl <- temp_tbl %>%
            dplyr::arrange(.data$race_id) %>%
            dplyr::rename(
              to_win = .data$`To Win`,
              ndb_points = .data$`WS Points`
            )

          temp_tbl <- temp_tbl %>%
            dplyr::mutate(
              Pos = as.integer(.data$Pos),
              Section = dplyr::case_when(
                .data$Section == "" ~ NA_character_,
                .data$Section == "NA" ~ NA_character_,
                TRUE ~ .data$Section
              ),
              # Arrival = Arrival %>% lubridate::hms()
              Miles = as.numeric(.data$Miles),
              # to_win = as.numeric(to_win),
              YPM = as.numeric(.data$YPM)
              )

          temp_tbl <- temp_tbl %>%
            dplyr::select(-.data$n, -.data$`NDB Points`)

          return(temp_tbl)
        }) %>% do.call("rbind", .)
    )
  return(tbls)
}
tijoalca/pigeonscraper documentation built on Sept. 2, 2021, 9:48 a.m.