# R/get_abs_xml_metadata.R

#' @importFrom rlang .data .env
#' @import dplyr
#' @noRd
#' @param url URL for an XML file in the ABS Time Series Directory
# Given the URL of an XML page in the ABS Time Series Directory, download the
# catalogue metadata (across however many pages of results there are) and
# return a tibble describing the tables in the latest release

get_abs_xml_metadata <- function(url) {
  if (!is.character(url)) {
    stop("`url` argument to get_abs_xml_metadata() must be a string.")
  }

  first_page_df <- get_first_xml_page(url)

  num_pages <- get_numpages(url)

  xml_dfs <- dplyr::tibble()

  # If there's more than one page of XML corresponding to request, get all of them
  if (num_pages > 1) {
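    # page 1 has already been fetched above, so request pages 2 to num_pages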
    all_pages <- 2:num_pages
    # create list of URLs of XML metadata to scrape
    full_urls <- paste0(url, "&pg=", all_pages)
    xml_dfs <- get_xml_dfs(full_urls)
  }

  xml_dfs <- dplyr::bind_rows(first_page_df, xml_dfs)

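  # ProductIssue is a month-year string (e.g. "Dec 2022"); prepend a day so it
  # parses as a Date. The remaining date columns are in dd/mm/yyyy format.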
  xml_dfs <- xml_dfs %>%
    dplyr::mutate(ProductIssue = as.Date(paste0("01 ", .data$ProductIssue),
      format = "%d %b %Y"
    )) %>%
    dplyr::mutate(dplyr::across(
      dplyr::all_of(c("ProductReleaseDate", "SeriesStart", "SeriesEnd")),
      ~ as.Date(.x, format = "%d/%m/%Y")
    ))

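  # If the directory contains rows from multiple releases of a table, keep
  # only those from the most recent release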
  xml_dfs <- xml_dfs %>%
    dplyr::group_by(.data$TableTitle) %>%
    dplyr::filter(.data$ProductReleaseDate == max(.data$ProductReleaseDate)) %>%
    dplyr::ungroup()

  xml_dfs <- dplyr::mutate(xml_dfs, TableOrder = as.numeric(.data$TableOrder))
  xml_dfs <- dplyr::arrange(xml_dfs, .data$TableOrder)

  xml_dfs
}
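
# A minimal usage sketch. The query URL below is illustrative only: readabs
# builds the real Time Series Directory URL elsewhere, so treat the servlet
# address and catalogue number as hypothetical.
#   url <- "https://abs.gov.au/servlet/TSSearchServlet?catno=6202.0"
#   meta <- get_abs_xml_metadata(url)
#   meta$TableTitle # one row per table in the latest release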

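# The Time Series Directory reports how many pages of results a query spans
# in a <NumPages> element; if that element is missing from the XML, we assume
# the results fit on a single page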
get_numpages <- function(url) {
  temp_xml <- tempfile(fileext = ".xml")

  dl_file(
    url = url,
    destfile = temp_xml
  )

  out <- temp_xml %>%
    xml2::read_xml() %>%
    xml2::xml_find_all(xpath = "//NumPages") %>%
    xml2::xml_double()

  if (length(out) == 0) {
    return(1)
  } else {
    return(out)
  }
}

get_first_xml_page <- function(url) {
  get_specific_xml_page(
    url = url,
    page = 1
  )
}

get_last_xml_page <- function(url) {
  num_pages <- get_numpages(url)
  get_specific_xml_page(
    url = url,
    page = num_pages
  )
}

get_specific_xml_page <- function(url, page) {
  if (length(page) > 0) {
    first_url <- paste0(
      url,
      "&pg=",
      page
    )
  } else {
    first_url <- url
  }

  # Some tables in the ABS TSD start with a leading zero, as in
  # Table 01 rather than Table 1; the 0 needs to be included. Here we first test
  # for a readable XML file using the table number supplied (eg. "1"); if that
  # doesn't work then we try with a leading zero ("01").

  safely_get_xml_dfs <- purrr::safely(get_xml_dfs)
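  # safely() returns list(result = , error = ); result is NULL if
  # get_xml_dfs() threw an error, which is how we detect an unreadable URL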
  first_page <- safely_get_xml_dfs(first_url)

  first_url_works <- !is.null(first_page$result)

  if (!first_url_works) {
    if (!grepl("ttitle", first_url)) { # this is the case when tables == "all"
      stop(paste0(
        "Cannot find valid entry",
        " in the ABS Time Series Directory. ",
        "Check that the cat. no. is correct, and that it contains ",
        "time series spreadsheets (not data cubes)."
      ))
    }

    # now try prepending a 0 on the ttitle

    first_url <- gsub("ttitle=", "ttitle=0", first_url)

    first_page <- safely_get_xml_dfs(first_url)

    first_url_works <- !is.null(first_page$result)

    if (first_url_works) {
      url <- gsub("ttitle=", "ttitle=0", url)
    } else {
      stop(
        "Cannot find valid entry for requested data ",
        "in the ABS Time Series Directory."
      )
    }
  }

  first_page$result
}
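
# Worked example of the fallback (the URL fragment is hypothetical):
#   requesting "...TSSearchServlet?catno=6202.0&ttitle=1" and failing to read
#   XML triggers a retry with "&ttitle=01" before erroring out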
