R/get-site-data.R

Defines functions clean_row reduce_table select_table check_tables get_site_tables grep_species get_site_html clean_region_names get_region_data get_site_data

get_site_data <- function(region_urls, species = "squirrel") {
  region_url_data <- tibble::tibble(region_url = region_urls)
  site_url_nested <- dplyr::mutate(region_url_data,
                                   site_url = purrr::map(region_url, get_region_data))
  site_url_data   <- tidyr::unnest(site_url_nested, cols = site_url)

  site_data <- dplyr::mutate(site_url_data,
                             site_html      = purrr::map(site_url, get_site_html),
                             site_html_char = purrr::map_chr(site_html, as.character),
                             is_species     = purrr::map_lgl(site_html_char, grep_species,
                                                             species = species),
                             site_tables    = purrr::map(site_html, get_site_tables),
                             site_table     = purrr::map(site_tables, select_table),
                             species_row    = purrr::map(site_table, reduce_table,
                                                         species = species))

  site_data
}


get_region_data <- function(region_url) {
  html             <- xml2::read_html(region_url)
  region_name_node <- rvest::html_node(html, xpath = "//div/div/h1")
  region_name      <- rvest::html_text(region_name_node)
  site_url_nodes   <- rvest::html_nodes(html, xpath = "//div[@class='menu']/a")
  site_urls        <- rvest::html_attr(site_url_nodes, name = "href")
  site_urls_encode <- vapply(site_urls, URLencode, character(1))  # URLencode is not vectorized
  site_names       <- rvest::html_text(site_url_nodes)
  url_data         <- tibble::tibble(region_name   = clean_region_names(region_name),
                                     site_name     = trimws(site_names, which = "both"),
                                     site_url      = site_urls_encode,
                                     site_url_base = basename(site_url))

  url_data
}


clean_region_names <- function(region_names) {
  ws_removed <- trimws(region_names, which = "both")
  name_only  <- gsub("\\s.*", "", ws_removed)

  name_only
}


get_site_html <- function(site_url, sleep_interval = 0.5) {
  Sys.sleep(sleep_interval)
  site_html   <- xml2::read_html(site_url)

  site_html
}


grep_species <- function(html_char, species) {
  html_mentions_species    <- grepl(species, html_char, ignore.case = TRUE)
  needs_special_adjustment <- grepl("Matthiessen Deer/Turkey Map", html_char, fixed = TRUE)

  is_species <- html_mentions_species & !needs_special_adjustment

  is_species
}


get_site_tables <- function(site_html) {
  site_tables <- tryCatch(rvest::html_table(site_html, header = TRUE, fill = TRUE, trim = TRUE),
                          error = function(e) NULL)
  tables_checked <- if(check_tables(site_tables)) site_tables else NULL
  
  tables_checked
}


check_tables <- function(table) {
  if(!length(table)) return(FALSE)

  names_are_blank <- all(names(table) == "")
  has_few_rows    <- nrow(table) < 2
  has_no_contents <- !any(!is.na(table))
  table_check     <- !names_are_blank || !has_few_rows || !has_no_contents
  
  table_check
}


select_table <- function(tables) {
  # in the case in which there are multiple tables on a given location's page,
  # the table with more rows contains a column indicating species.
  if(length(tables) > 1) {
    row_counts <- vapply(tables, nrow, integer(1))
    max_row_ix <- which.max(row_counts)
    table      <- tables[[max_row_ix]]
  } else {
    table <- tables[[1]]
  }

  table
}


reduce_table <- function(table, species = "squirrel") {
  if(is.null(table)) return(NULL)

  col_is_species <- grepl("species|program", names(table), ignore.case = TRUE)
  if(!any(col_is_species)) return(NULL)

  species_col <- table[, col_is_species]
  species_ix  <- which(grepl(species, species_col, ignore.case = TRUE))
  species_row <- table[species_ix, ]
  reduced_row <- if(nrow(species_row) == 0) NULL else clean_row(species_row, col_is_species)
  cleaned_row <- 

  reduced_row
}


clean_row <- function(row, which_col) {
  row[, which_col] <- gsub("1.*", "", row[, which_col])
  row
}
ataustin/huntil documentation built on July 30, 2020, 3:21 a.m.