R/collect_occurrences.R

Defines functions: `load_zip()`, `enforce_field_names_and_types()`, `collect_occurrences_doi()`, `collect_occurrences_default()`, `collect_occurrences_uk()`, and `collect_occurrences()`.

#' Workhorse function to get occurrences from an atlas
#' @noRd
#' @param .query an object of class `data_response`, created using 
#' `compute.data_request()`
#' @param wait logical; should we ping the API until the download is ready?
#' @param file optional path for the downloaded zip file; passed to
#' `check_download_filename()`
#' @keywords Internal
#' @importFrom potions pour
#' @importFrom rlang abort
#' @importFrom rlang inform
#' @importFrom tibble tibble
collect_occurrences <- function(.query, wait, file = NULL){
  switch(pour("atlas", "region"),
         "United Kingdom" = collect_occurrences_uk(.query, file = file),
         collect_occurrences_default(.query, wait = wait, file = file))
}
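
# Illustrative usage sketch (not part of the package source). It assumes
# `collect_occurrences()` is reached after `compute()` has returned a
# `data_response`, with the atlas chosen via `galah_config()`; object names
# below are hypothetical.
if (FALSE) {
  galah_config(atlas = "United Kingdom")          # dispatches to collect_occurrences_uk()
  resp <- compute(my_data_request)                # hypothetical `data_response` object
  occ  <- collect_occurrences(resp, wait = TRUE)  # tibble of occurrence records
}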

#' Internal version of `collect_occurrences()` for the United Kingdom atlas
#' @noRd
#' @keywords Internal
collect_occurrences_uk <- function(.query, file){
  .query$download <- TRUE
  .query$file <- check_download_filename(file)
  query_API(.query)
  result <- load_zip(.query$file)
  if(is.null(result)){
    inform("Download failed")
    return(tibble())
  }else{
    result |>
      enforce_field_names_and_types(.query)
  }
}

#' Internal version of `collect_occurrences()` for living atlases (the default)
#' @noRd
#' @keywords Internal
collect_occurrences_default <- function(.query, wait, file){
  # check queue
  download_response <- check_queue(.query, wait = wait)
  if(is.null(download_response)){
    abort("No response from selected atlas")
  }
  # get data
  if(pour("package", "verbose", .pkg = "galah") &
     download_response$status == "complete") {
    inform("Downloading")
  }
  # sometimes lookup info critical, but not others - unclear when/why!
  if(any(names(download_response) == "download_url")){
    new_object <- list(url = download_response$download_url,
                       download = TRUE)
    new_object$file <- check_download_filename(file)
    query_API(new_object)
    result <- load_zip(new_object$file)
  }else{
    return(download_response) 
  }
  # handle result
  if(is.null(result)){
    inform("Download failed")
    return(tibble())
  }else{
    result <- result |>
      enforce_field_names_and_types(.query) |>
      check_media_cols()  # check for, and then clean, media info
    # exception for GBIF to ensure DOIs are preserved
    if(!is.null(.query$doi)){
      attr(result, "doi") <- paste0("https://doi.org/", .query$doi)
    }
    if(!is.null(.query$search_url)){
      attr(result, "search_url") <- .query$search_url
    }
    result
  }
}
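
# Illustrative sketch (not part of the package source): when a download
# succeeds, the DOI and search URL (if present on `.query`) travel with the
# returned tibble as attributes rather than as columns. `occ` below is a
# hypothetical result from `collect_occurrences_default()`.
if (FALSE) {
  attr(occ, "doi")        # e.g. "https://doi.org/..." for GBIF downloads
  attr(occ, "search_url") # the search URL stored on the query, if any
}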

#' Internal helper to collect occurrence records for a given DOI
#' @param .query An object of class `data_request`
#' @noRd
#' @keywords Internal
#' @importFrom potions pour
#' @importFrom rlang abort
#' @importFrom rlang caller_env
#' @importFrom rlang inform
#' @importFrom tibble tibble
collect_occurrences_doi <- function(.query, 
                                    file = NULL, 
                                    error_call = caller_env()) {
  .query$file <- check_download_filename(file)
  query_API(.query)
  result <- load_zip(.query$file)
  if(is.null(result)){
    inform("Download failed.")
    tibble()
  }else{
    result
  }
}

#' Ensure that the column names given in `select()` are returned
#' Note that `fields` and `assertions` are always kept separate;
#' assertion columns are also cleaned here for efficiency reasons
#' @importFrom rlang abort
#' @importFrom rlang caller_env
#' @noRd
#' @keywords Internal
enforce_field_names_and_types <- function(df, .query){
  if(!is.null(.query$fields)){
    # get basic info
    n_fields <- length(.query$fields)
    n_cols <- ncol(df)
    # first handle field names (ensure they match those given by the user)
    if(n_fields > n_cols){
      bullets <- c("More fields were requested than are present in the download.",
                   i = "Consider using `show_all(fields)` to ensure all fields are valid.",
                   i = "Alternatively set `galah_config(run_checks = TRUE)`")
      abort(bullets, call = caller_env())
    }
    fields_not_assertions <- seq_len(n_fields)
    names(df)[fields_not_assertions] <- .query$fields
    # then assertions (replace 'true' and 'false' with boolean)
    if(n_fields < n_cols){
      assertions_not_fields <- seq((n_fields + 1), n_cols, by = 1)
      df <- fix_assertion_cols(df, names(df)[assertions_not_fields])
    }
  }
  df
}
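
# Toy illustration (not package code) of the column split enforced above: the
# first `length(.query$fields)` columns are renamed to the requested field
# names, and any remaining columns are treated as assertions and passed to
# `fix_assertion_cols()`. Field names below are hypothetical.
if (FALSE) {
  df <- data.frame(V1 = 1, V2 = 2, assert1 = "true", assert2 = "false")
  fields <- c("decimalLatitude", "decimalLongitude")
  names(df)[seq_along(fields)] <- fields
  names(df)
  #> [1] "decimalLatitude"  "decimalLongitude" "assert1"  "assert2"
  # columns (length(fields) + 1):ncol(df) would then go to fix_assertion_cols()
}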

#' Internal function to read data from zip files without extracting them to disk
#' @importFrom dplyr bind_rows
#' @importFrom readr cols
#' @importFrom readr read_csv
#' @importFrom readr read_tsv
#' @importFrom utils unzip
#' @noRd
#' @keywords Internal
load_zip <- function(cache_file){
  # get names of files stored in .zip
  all_files <- unzip(cache_file, list = TRUE)$Name
  # zip files contain a lot of metadata that `galah` does not import
  # import only those files that meet our criteria for 'data'
  if(is_gbif()){
    available_files <- all_files[grepl("\\.csv$", all_files)]
    result <- unz(description = cache_file,  # NB: unz() reads a single file; >1 csv would need lapply()
                  filename = available_files) |> 
      read_tsv(col_types = cols()) |>
      suppressWarnings()
    # Note: DOIs for GBIF are stored at the `compute()` stage, not in the zip file
  }else{
    available_files <- all_files[grepl("\\.csv$", all_files) &
                                   grepl("^data|records", all_files)]
    result <- lapply(available_files, 
                     function(a, x){
                       # create connection to a specific file within zip
                       conn <- unz(description = x, 
                                   filename = a, 
                                   open = "rb")
                       out <- read_csv(conn, 
                                       col_types = cols()) |>
                         suppressWarnings()
                       close(conn)
                       return(out)
                     }, x = cache_file) |>
      bind_rows()
    # add doi when mint_doi = TRUE
    if(any(all_files == "doi.txt")){
      conn <- unz(description = cache_file, 
                  filename = "doi.txt", 
                  open = "rb")
      attr(result, "doi") <- readr::read_file(conn) |>
        sub("\\n$", "", x = _)
      close(conn)
    }
  }
  result
}
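
# Standalone sketch (not package code) of the unz() pattern used above: base
# R's unz() opens a connection to a single file inside a zip archive, which
# readr can read directly without extracting to disk. File names are
# hypothetical.
if (FALSE) {
  files <- unzip("records.zip", list = TRUE)$Name               # list contents only
  conn  <- unz("records.zip", filename = files[1], open = "rb")
  occ   <- readr::read_csv(conn, col_types = readr::cols())
  close(conn)
}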
