R/BigDataPE.R

Defines functions bdpe_fetch_chunks bdpe_fetch_data bdpe_list_tokens bdpe_remove_token bdpe_get_token bdpe_store_token parse_queries

Documented in bdpe_fetch_chunks bdpe_fetch_data bdpe_get_token bdpe_list_tokens bdpe_remove_token bdpe_store_token parse_queries

#' Constructs a URL with query parameters
#'
#' This function appends a list of query parameters to a base URL.
#'
#' @param url The base URL to which query parameters will be added.
#' @param query_list A list of query parameters to be added to the URL.
#' @return The complete URL with the query parameters appended.
#' @examples
#' parse_queries("https://www.example.com", list(param1 = "value1", param2 = "value2"))
#' @export
parse_queries <- function(url, query_list) {
  if (length(query_list) == 0) {
    return(url) # Return the base URL if no queries are provided
  }

  # Filter out parameters with empty values
  query_list <- query_list[query_list != ""]

  # If no valid parameters remain, return the base URL
  if (length(query_list) == 0) {
    return(url)
  }

  # Encode parameters and construct the query string
  query_string <- paste(
    sapply(names(query_list), function(name) {
      paste0(
        utils::URLencode(name, reserved = TRUE), "=", utils::URLencode(as.character(query_list[[name]]), reserved = TRUE)
      )
    }),
    collapse = "&"
  )

  # Append the query string to the URL
  paste0(url, "?", query_string)
}



#' Store a token in an environment variable for a specific dataset
#'
#' This function stores an authentication token for a specific dataset
#' in a system environment variable. The environment variable name is
#' constructed by converting the dataset name to ASCII (removing accents),
#' replacing spaces with underscores, and prefixing it with "BigDataPE_".
#'
#' If a variable with that name already exists (and is non-empty), the function
#' will stop and notify you to avoid overwriting. Adjust this behavior as needed.
#'
#' @param base_name The name of the dataset (character).
#' @param token The authentication token for the dataset (character).
#'
#' @return No return value, called for side effects.
#' @examples
#' bdpe_store_token("educação dataset", "your-token-here")
#'
#' @export
bdpe_store_token <- function(base_name, token) {
  # 1. Validate inputs
  if (!is.character(base_name) || !nzchar(base_name)) {
    stop("Dataset name must be a valid string.")
  }
  if (!is.character(token) || !nzchar(token)) {
    stop("Token must be a valid string.")
  }

  # 2. Convert the dataset name to ASCII (removing accents), then replace spaces with underscores
  base_name_ascii <- iconv(base_name, from = "", to = "ASCII//TRANSLIT")
  base_name_sanitized <- gsub(" ", "_", base_name_ascii)

  # 3. Construct the environment variable name
  env_var_name <- paste0("BigDataPE_", base_name_sanitized)

  # 4. Check if the environment variable is already in use
  if (nzchar(Sys.getenv(env_var_name, unset = ""))) {
    # Stop to avoid overwriting an existing token; change to warning or message as needed
    message(
      "The environment variable '", env_var_name,
      "' is already defined. Not overwriting to avoid data loss."
    )
    return(invisible())
  }

  # 5. Store the token in the environment variable
  env_list <- list(token)
  names(env_list) <- env_var_name
  do.call(Sys.setenv, env_list)

  message("Token stored in environment variable: ", env_var_name)
}

#' Retrieve the token associated with a specific dataset
#'
#' This function retrieves the authentication token stored
#' in an environment variable for a specific dataset. If the token is not found,
#' it returns `NULL` and prints a message instead of throwing an error.
#'
#' @param base_name The name of the dataset (character).
#'
#' @return A string containing the authentication token, or `NULL` if the token is not found.
#' @examples
#' token <- bdpe_get_token("education_dataset")
#'
#' @export
bdpe_get_token <- function(base_name) {
  # 1. Validate the base_name
  if (!is.character(base_name) || !nzchar(base_name)) {
    stop("Dataset name must be a valid string.")
  }

  # 2. Convert the dataset name to ASCII (removing accents), then replace spaces with underscores
  base_name_ascii <- iconv(base_name, from = "", to = "ASCII//TRANSLIT")
  base_name_sanitized <- gsub(" ", "_", base_name_ascii)

  # 3. Construct the environment variable name
  env_var_name <- paste0("BigDataPE_", base_name_sanitized)

  # 4. Retrieve the token from the environment variable
  token <- Sys.getenv(env_var_name, unset = "")

  # 5. If the token is empty, return NULL with a message
  if (token == "") {
    message("No token found for dataset: ", base_name)
    return(NULL)
  }

  return(token)
}

#' Remove the token associated with a specific dataset
#'
#' This function removes the authentication token stored
#' in an environment variable for a specific dataset. If the token is not found,
#' it prints a message and does not throw an error.
#'
#' @param base_name The name of the dataset (character).
#'
#' @return No return value. If the token is found, it is removed. If not, a message is displayed.
#' @examples
#' bdpe_remove_token("education_dataset")
#'
#' @export
bdpe_remove_token <- function(base_name) {
  # 1. Validate the base_name
  if (!is.character(base_name) || !nzchar(base_name)) {
    stop("Dataset name must be a valid string.")
  }

  # 2. Convert the dataset name to ASCII (removing accents), then replace spaces with underscores
  base_name_ascii <- iconv(base_name, from = "", to = "ASCII//TRANSLIT")
  base_name_sanitized <- gsub(" ", "_", base_name_ascii)

  # 3. Construct the environment variable name
  env_var_name <- paste0("BigDataPE_", base_name_sanitized)

  # 4. Check if the variable exists (non-empty)
  if (nzchar(Sys.getenv(env_var_name, unset = ""))) {
    # If it exists, unset (remove) the variable
    Sys.unsetenv(env_var_name)
    message("Token successfully removed for dataset: ", base_name)
  } else {
    message("No token found for dataset: ", base_name)
  }
}



#' List all datasets that have stored tokens in environment variables
#'
#' This function returns a character vector of dataset names
#' that have their tokens stored in environment variables.
#' Specifically, it looks for variables that begin with the prefix
#' `"BigDataPE_"`.
#'
#' @return A character vector of dataset names with stored tokens.
#'         If no tokens are found, an empty vector is returned and
#'         a message is printed.
#'
#' @examples
#' bdpe_list_tokens()
#'
#' @export
bdpe_list_tokens <- function() {
  # 1. Retrieve all environment variables
  all_envs <- Sys.getenv()

  # 2. Filter for those that start with 'BigDataPE_'
  stored_tokens <- grep("^BigDataPE_", names(all_envs), value = TRUE)

  # 3. If no tokens found, return empty vector with a message
  if (length(stored_tokens) == 0) {
    message("No tokens found in environment variables.")
    return(character(0))
  }

  # 4. Remove prefix from variable names to extract the "sanitized" dataset name
  dataset_names <- sub("^BigDataPE_", "", stored_tokens)

  # 5. Return the dataset names
  return(dataset_names)
}


#' Fetch data from the BigDataPE API
#'
#' This function retrieves data from the BigDataPE API using securely stored tokens associated with datasets.
#' Users can specify pagination parameters (`limit` and `offset`) and additional query filters to customize the data retrieval.
#'
#' @param base_name A string specifying the name of the dataset associated with the token.
#' @param limit An integer specifying the maximum number of records to retrieve per request. Default is Inf (all records).
#'              If set to a non-positive value or `Inf`, no limit will be applied.
#' @param offset An integer specifying the starting record for the query. Default is 0.
#'               If set to a non-positive value or `Inf`, no offset will be applied.
#' @param query A named list of additional query parameters to filter the API results. Default is an empty list.
#' @param endpoint A string specifying the API endpoint URL. Default is "https://www.bigdata.pe.gov.br/api/buscar".
#' @param verbosity An integer specifying the verbosity level for the API requests.
#'                  Values are:
#'                  - `0`: No verbosity (default).
#'                  - `1`: Minimal verbosity, showing request status.
#'                  - `2`: Detailed verbosity, including request and response details.
#'
#' @return A tibble containing the data returned by the API.
#' @examples
#' \dontrun{
#' # Store a token for the dataset
#' bdpe_store_token("dengue_dataset", "token")
#'
#' # Fetch 50 records from the beginning
#' data <- bdpe_fetch_data("dengue_dataset", limit = 50)
#'
#' # Fetch records with additional query parameters
#' data <- bdpe_fetch_data("dengue_dataset", query = list(field = "value"))
#'
#' # Fetch all data without limits
#' data <- bdpe_fetch_data("dengue_dataset", limit = Inf)
#' }
#' @export
bdpe_fetch_data <- function(
    base_name,
    limit = Inf,
    offset = 0L,
    query = list(),
    verbosity = 0L,
    endpoint = "https://www.bigdata.pe.gov.br/api/buscar") {

  # Retrieve the token for the specified dataset
  token <- bdpe_get_token(base_name)

  # Input validation
  stopifnot(is.list(query))
  stopifnot(is.integer(offset))
  stopifnot(is.integer(limit) || is.infinite(limit))

  # Adjust limit and offset for the request
  if (is.infinite(limit) || limit <= 0) limit <- ""
  if (is.infinite(offset) || offset <= 0) offset <- ""

  # Build and send the API request
  req <- endpoint |>
    parse_queries(query_list = query) |>
    httr2::request() |>
    httr2::req_headers(
      Authorization = token,
      limit = limit,
      offset = offset
    )

  # Perform the request and parse the response
  res <- req |>
    httr2::req_perform(verbosity = verbosity) |>
    httr2::resp_body_json(simplifyVector = TRUE) |>
    tibble::as_tibble()

  return(res)
}


#' Fetch data from the BigDataPE API in chunks
#'
#' This function retrieves data from the BigDataPE API iteratively in chunks.
#' It uses `bdpe_fetch_data` as the base function and supports limits for the
#' total number of records to fetch and the size of each chunk.
#'
#' @param base_name A string specifying the name of the dataset associated with the token.
#' @param total_limit An integer specifying the maximum number of records to fetch. Default is Inf (all available data).
#' @param chunk_size An integer specifying the number of records to fetch per chunk. Default is 50000
#' @param query A named list of additional query parameters to filter the API results. Default is an empty list.
#' @param endpoint A string specifying the API endpoint URL. Default is "https://www.bigdata.pe.gov.br/api/buscar".
#' @param verbosity An integer specifying the verbosity level for the API requests.
#'                  Values are:
#'                  - `0`: No verbosity (default).
#'                  - `1`: Minimal verbosity, showing request status.
#'                  - `2`: Detailed verbosity, including request and response details.
#' @return A tibble containing all the data retrieved from the API.
#' @examples
#' \dontrun{
#' # Store a token for the dataset
#' bdpe_store_token("dengue_dataset", "token")
#'
#' # Fetch up to 500 records in chunks of 100
#' data <- bdpe_fetch_chunks("dengue_dataset", total_limit = 500, chunk_size = 100)
#'
#' # Fetch all available data in chunks of 200
#' data <- bdpe_fetch_chunks("dengue_dataset", chunk_size = 200)
#' }
#' @export
bdpe_fetch_chunks <- function(
    base_name,
    total_limit = Inf,
    chunk_size = 500000L,
    query = list(),
    verbosity = 0L,
    endpoint = "https://www.bigdata.pe.gov.br/api/buscar") {

  # Input validation
  stopifnot(is.integer(total_limit) || is.infinite(total_limit))
  stopifnot(is.integer(chunk_size) && chunk_size > 0)
  stopifnot(is.list(query))

  # Initialize variables
  offset <- 0L
  total_fetched <- 0L
  all_data <- list()

  # Fetch data in chunks
  repeat {
    # Calculate the limit for the current chunk
    current_limit <- as.integer(min(chunk_size, total_limit - total_fetched))

    #if(verbosity > 0) message("Current limit: ", current_limit)

    # Break if no more records are needed
    if (current_limit <= 0) break

    # Fetch the current chunk
    chunk <- bdpe_fetch_data(
      base_name = base_name,
      limit = current_limit,
      offset = offset,
      query = query,
      verbosity = verbosity,
      endpoint = endpoint
    )
    nms <- names(chunk)

    #if(verbosity > 0)
    #  chunk |> glimpse()

    if ("Mensagem" %in% nms)
      chunk <- dplyr::select(chunk, -"Mensagem")

    # Stop if the API returns no data
    if (nrow(chunk) == 0L) break

    # Append the chunk to the results
    all_data <- append(all_data, list(chunk))
    total_fetched <- as.integer(total_fetched) + nrow(chunk)

    # Update the offset for the next chunk
    offset <- as.integer(offset) + nrow(chunk)

    # Break if the total limit is reached
    if (total_fetched >= total_limit) break
  }

  # Combine all chunks into a single tibble
  combined_data <- dplyr::bind_rows(all_data)
  return(combined_data)
}

Try the BigDataPE package in your browser

Any scripts or data that you put into this service are public.

BigDataPE documentation built on April 4, 2025, 4:47 a.m.