# R/scrape_table_list.R
#
# Defines functions: note_readable, scrape_indiv_table_list, scrape_table_list
#
# Documented in: scrape_table_list

#' Scrape the RBA site to obtain links to tables
#'
#' Scrapes the current and/or historical RBA statistical table listings via
#' `scrape_indiv_table_list()` and flags, via `note_readable()`, which of the
#' tables are known not to be readable.
#'
#' @param cur_hist A single string: "current", "historical", or "all".
#'   With "all", both listings are scraped and row-bound (current first).
#' @return A tibble with one row per XLS/XLSX link and the columns `title`,
#'   `no`, `url`, `current_or_historical` and `readable`.
#' @keywords internal
scrape_table_list <- function(cur_hist = "all") {
  # Validate up front so that NA, NULL or multi-element input produces the
  # intended message rather than a cryptic failure from the scalar `if` below.
  is_valid <- length(cur_hist) == 1L &&
    cur_hist %in% c("current", "historical", "all")
  if (!is_valid) {
    stop("cur_hist must be 'current', 'historical', or 'all'.")
  }

  if (cur_hist == "all") {
    table_list <- purrr::map_dfr(
      .x = c("current", "historical"),
      .f = scrape_indiv_table_list
    )
  } else {
    table_list <- scrape_indiv_table_list(cur_hist = cur_hist)
  }

  note_readable(table_list)
}


#' Scrape a list of RBA tables from a single RBA page
#'
#' Not intended to be called directly - called from `scrape_table_list()`.
#'
#' @param cur_hist A single string, either "current" or "historical",
#'   selecting which RBA page (and matching CSS selector) is scraped.
#' @return A tibble with the columns `title`, `no`, `url` and
#'   `current_or_historical`. Links whose text yields no table number, or
#'   whose text mentions "Occasional Paper" or "Download", are dropped.
#' @noRd
scrape_indiv_table_list <- function(cur_hist = "current") {
  # Reject unsupported values here; otherwise neither branch below would run
  # and the failure would surface later as "object 'table_url' not found".
  if (length(cur_hist) != 1L ||
    !(cur_hist %in% c("current", "historical"))) {
    stop("cur_hist must be 'current' or 'historical'.")
  }

  if (cur_hist == "current") {
    table_url <- "https://www.rba.gov.au/statistics/tables/"
    css_selector <- "#tables-list li a"
  } else {
    table_url <- "https://www.rba.gov.au/statistics/historical-data.html"
    css_selector <- ".width-text li a"
  }

  table_page <- safely_read_html(url = table_url)

  link_list <- rvest::html_nodes(table_page, css_selector)

  # grepl() coerces each node to character, so this keeps every anchor whose
  # markup contains "xls" (which also covers "xlsx").
  link_list <- link_list[grepl("xls", link_list, fixed = TRUE)]

  excel_links <- rvest::html_attr(link_list, "href")
  excel_text <- rvest::html_text(link_list, trim = TRUE)

  stopifnot(identical(length(excel_links), length(excel_text)))

  # U+2013 is the en dash; it is the separator the link text is split on.
  en_dash <- "\u2013"
  # Matches only the LAST en dash in a string: an en dash that is not
  # followed (after any run of non-dash characters) by another en dash.
  regex_string <- paste0(en_dash, "(?![^", en_dash, "]*", en_dash, ")")

  # Some historical tables don't have a table number; we add one
  if (cur_hist == "historical") {
    # Appends " <en dash> <table_no>" so the split below finds a number.
    with_table_no <- function(table_no) {
      paste0(excel_text, " ", en_dash, " ", table_no)
    }

    # NOTE(review): the str_sub() offsets below presumably pick two-digit
    # years out of the link text and so depend on the exact wording used on
    # the RBA page - confirm against the live page if results look wrong.
    excel_text <- dplyr::case_when(
      grepl("Exchange Rates", excel_text) &
        grepl("Daily", excel_text) &
        grepl("Current", excel_text) ~
        with_table_no(paste0(
          "ex_daily_",
          stringr::str_sub(excel_text, -13, -12),
          "cur"
        )),
      grepl(paste0("Exchange Rates ", en_dash, " Daily"), excel_text) ~
        with_table_no(paste0(
          "ex_daily_",
          stringr::str_sub(excel_text, -10, -9),
          stringr::str_sub(excel_text, -2, -1)
        )),
      grepl(paste0("Exchange Rates ", en_dash, " Monthly"), excel_text) &
        grepl("current", excel_text) ~
        with_table_no("ex_monthly_10cur"),
      grepl(paste0("Exchange Rates ", en_dash, " Monthly"), excel_text) &
        grepl("1969", excel_text) ~
        with_table_no("ex_monthly_6909"),
      TRUE ~ excel_text
    )
  }

  table_list <- dplyr::tibble(
    title = excel_text,
    url = paste0("https://www.rba.gov.au", excel_links)
  )

  table_list <- table_list %>%
    # Split at the last en dash: left part is the title, right part the
    # table number. Rows without an en dash get `no = NA` (fill = "right").
    tidyr::separate(
      col = "title",
      into = c("title", "no"),
      sep = regex_string,
      fill = "right"
    ) %>%
    dplyr::mutate(dplyr::across(
      c("title", "no"),
      stringr::str_trim
    )) %>%
    # `excel_text` is not a column: it is the function-local vector of full
    # link texts, which is still row-aligned with `table_list` at this point.
    dplyr::filter(
      !is.na(.data$no),
      !grepl("Occasional Paper", excel_text),
      !grepl("Download", excel_text)
    )

  table_list$current_or_historical <- cur_hist

  table_list
}

#' Indicate tables that can't be read by `read_rba()`
#'
#' Some of these are non-time series; others are very old and formatted in a
#' non-standard way. For now these are hard-coded into this function; in
#' future I will work on programmatically recognising readable tables.
#'
#' @param table_list A dataframe generated by `scrape_indiv_table_list()`;
#'   it must contain the columns `no` and `current_or_historical`.
#' @return `table_list` with an added logical column `readable`, which is
#'   `FALSE` for the hard-coded table numbers below and `TRUE` otherwise.
#' @noRd
note_readable <- function(table_list) {
  dplyr::mutate(table_list,
    readable =
      dplyr::case_when(
        # `.data$no` (rather than bare `no`) guarantees the column is used
        # and errors clearly if it is missing, instead of silently picking
        # up an object called `no` from the calling environment.
        .data$current_or_historical == "current" &
          .data$no %in% c("E3", "E4", "E5", "E6", "E7") ~ FALSE,
        .data$current_or_historical == "historical" &
          .data$no %in% c(
            "A3", "J1", "J2", "E4", "E5", "E6", "E7", "F16", "F17"
          ) ~ FALSE,
        TRUE ~ TRUE
      )
  )
}

# Try the readrba package in your browser
#
# Any scripts or data that you put into this service are public.
#
# readrba documentation built on Aug. 13, 2023, 9:06 a.m.