R/ids_bulk.R

Defines functions read_excel_wrapper prompt_user download_file check_interactive process_bulk_data read_bulk_file validate_file download_bulk_file get_response_headers ids_bulk

Documented in ids_bulk

#' Download and Process Bulk Data File for International Debt Statistics
#'
#' This function downloads a data file from the World Bank International Debt
#' Statistics (IDS), reads and processes the data into a tidy format.
#'
#' @param file_url A character string specifying the URL of the Excel file to
#' download. This parameter is required (see \link{ids_bulk_files}).
#' @param file_path An optional character string specifying the file path where
#' the downloaded file will be saved. Defaults to a temporary file with `.xlsx`
#' extension. The file will automatically be deleted after processing.
#' @param quiet A logical parameter indicating whether messages should be
#' printed to the console.
#' @param timeout An integer specifying the timeout in seconds for downloading
#' the file. Defaults to the current R timeout setting.
#' @param warn_size A logical parameter indicating whether to warn about large
#' downloads. Defaults to TRUE.
#'
#' @return A tibble containing processed debt statistics data with the following
#' columns:
#' \describe{
#'   \item{entity_id}{The unique identifier for the entity (e.g., "ZMB").}
#'   \item{series_id}{The unique identifier for the series (e.g.,
#'                    "DT.DOD.DPPG.CD").}
#'   \item{counterpart_id}{The unique identifier for the counterpart series.}
#'   \item{year}{The year corresponding to the data (as an integer).}
#'   \item{value}{The numeric value representing the statistic for the given
#'                entity, series, counterpart, and year.}
#' }
#'
#' @export
#' @examplesIf curl::has_internet() && rlang::is_installed("readxl")
#' \dontrun{
#' available_files <- ids_bulk_files()
#' data <- ids_bulk(
#'   available_files$file_url[1]
#' )
#' }
#'
ids_bulk <- function(
  file_url,
  file_path = tempfile(fileext = ".xlsx"),
  quiet = FALSE,
  timeout = getOption("timeout", 60),
  warn_size = TRUE
) {
  rlang::check_installed("readxl", reason = "to download bulk files.")

  on.exit(unlink(file_path))

  download_bulk_file(file_url, file_path, timeout, warn_size, quiet)

  if (!quiet) {
    cli::cli_progress_message("Reading in file.")
  }
  bulk_data <- read_bulk_file(file_path)

  if (!quiet) {
    cli::cli_progress_message("Processing file.")
  }
  bulk_data <- process_bulk_data(bulk_data)

  bulk_data
}

#' Get response headers from a URL
#'
#' @param file_url URL to request headers from
#' @return List of response headers
#'
#' @keywords internal
#' @noRd
get_response_headers <- function(file_url) {
  httr2::request(file_url) |>
    httr2::req_headers("Accept" = "*/*") |>
    httr2::req_method("HEAD") |>
    httr2::req_perform() |>
    httr2::resp_headers()
}

#' Download bulk data file with validation
#'
#' @param file_url URL of the file to download
#' @param file_path Path where file should be saved
#' @param timeout Timeout in seconds
#' @param warn_size Whether to warn about large files
#' @param quiet Whether to suppress messages
#'
#' @keywords internal
#' @noRd
#'
download_bulk_file <- function(file_url, file_path, timeout, warn_size, quiet) {
  # Error on response mime type mismatch (esp. HTML instead of Excel)
  response_headers <- get_response_headers(file_url)
  mime_type <- "application/octet-stream"

  if (response_headers$`content-type` != mime_type) {
    cli::cli_abort(
      paste0(
        "Request returned an invalid file type. ",
        "Please check the URL and try again."
      )
    )
  }

  size_mb <- as.numeric(response_headers$`content-length`) / 1024^2
  formatted_size <- format(round(size_mb, 1), nsmall = 1) # nolint

  if (warn_size && size_mb > 100) {
    cli::cli_alert_warning(paste0(
      "This file is {formatted_size} MB and may take several minutes to ",
      "download. Current timeout setting: {timeout} seconds. Use ",
      "{.code warn_size = FALSE} to disable this warning."
    ))

    if (warn_size && check_interactive()) {
      response <- prompt_user(
        "Do you want to continue with the download? (y/N): "
      )
      if (!tolower(response) %in% c("y", "yes")) {
        cli::cli_abort("Download cancelled by user")
      }
    }
  }

  if (!quiet) {
    cli::cli_progress_message("Downloading file to: {file_path}")
  }

  # nocov start
  withr::with_options(
    list(timeout = timeout),
    tryCatch(
      {
        download_file(file_url, destfile = file_path, quiet = quiet)
      },
      error = function(e) {
        if (grepl("timeout|cannot open URL", e$message, ignore.case = TRUE)) {
          cli::cli_abort(
            paste0(
              "Download timed out after ",
              timeout,
              " seconds.\n",
              "Try increasing the timeout parameter",
              " (e.g., timeout=600 for 10 minutes)"
            )
          )
        }
        cli::cli_abort(e$message)
      }
    )
  )
  # nocov end
  validate_file(file_path)
}

#' Validate that downloaded file exists and is not empty
#'
#' @param file_path Path to file to validate
#' @noRd
#'
validate_file <- function(file_path) {
  if (!file.exists(file_path)) {
    cli::cli_abort("Download failed: File not created")
  }
  if (file.size(file_path) == 0) {
    unlink(file_path)
    cli::cli_abort("Download failed: Empty file")
  }
}

#' Read bulk file and determine column types
#'
#' @param file_path Path to Excel file
#' @return Raw data frame from Excel file
#' @noRd
#'
read_bulk_file <- function(file_path) {
  # Read in first row of Excel file to get column names
  header_row <- read_excel_wrapper(path = file_path, n_max = 0)

  # Get all column names
  available_columns <- header_row |>
    colnames()

  # Initialize a helper tibble mapping relevant column names to types
  relevant_columns <- tibble::tibble(names = available_columns) |>
    # Drop column names that contain "column" (which are empty in the bulk file)
    filter(!grepl("column", .data$names, ignore.case = TRUE)) |>
    # Year columns are numeric, all others are text
    dplyr::mutate(
      types = dplyr::if_else(
        grepl(pattern = "[0:9]", .data$names),
        "numeric",
        "text"
      )
    )

  # Read in the data from the Excel file for only the relevant columns
  read_excel_wrapper(
    path = file_path,
    range = readxl::cell_cols(seq_len(nrow(relevant_columns))),
    col_types = relevant_columns$types
  )
}

#' Process bulk data into tidy format
#'
#' @param bulk_raw Raw data frame from Excel file
#' @return Processed tibble in tidy format
#' @noRd
#'
process_bulk_data <- function(bulk_raw) {
  bulk_raw |>
    # Select only the relevant columns
    dplyr::select(
      "Country Code",
      "Series Code",
      "Counterpart-Area Code",
      dplyr::matches("^\\d{4}$")
    ) |>
    # Rename columns to match the package data model
    select(
      entity_id = "Country Code",
      series_id = "Series Code",
      counterpart_id = "Counterpart-Area Code",
      dplyr::everything()
    ) |>
    # Pivot to long (tidy) format
    tidyr::pivot_longer(
      cols = -c("entity_id", "series_id", "counterpart_id"),
      names_to = "year"
    ) |>
    # Convert year to integer
    dplyr::mutate(year = as.integer(.data$year)) |>
    # Drop rows with NA values
    tidyr::drop_na()
}

#' Check if R is running interactively
#'
#' Wrapper around base::interactive() to make the function testable.
#' This function exists primarily to facilitate testing of interactive features.
#'
#' @return Logical indicating whether R is running interactively
#' @keywords internal
#' @noRd
#'
check_interactive <- function() {
  interactive()
}

#' Download a file using utils::download.file
#'
#' Wrapper around utils::download.file to facilitate testing.
#'
#' @param url URL of file to download
#' @param destfile Destination file path
#' @param quiet Whether to suppress messages
#' @return Invisibly returns the status code from download.file
#' @keywords internal
#' @noRd
#'
download_file <- function(url, destfile, quiet) {
  utils::download.file(url, destfile = destfile, quiet = quiet, mode = "wb")
}

#' Prompt a user with a question
#'
#' Wrapper around base::readline to facilitate testing. Cannot be tested
#' because of the base binding.
#'
#' @keywords internal
#' @noRd
#'
prompt_user <- function(prompt) {
  readline(prompt) # nocov
}

#' Read an Excel file using readxl
#'
#' Wrapper around readxl::read_excel to facilitate testing without mocking
#' external package namespaces.
#'
#' @param ... Arguments passed to readxl::read_excel
#' @return Data frame from Excel file
#' @keywords internal
#' @noRd
#'
read_excel_wrapper <- function(...) {
  readxl::read_excel(...)
}

Try the wbids package in your browser

Any scripts or data that you put into this service are public.

wbids documentation built on Dec. 20, 2025, 9:06 a.m.