R/cas_download_internal.R

Defines functions cas_download_internal

Documented in cas_download_internal

#' Downloads one file at a time with readLines
#'
#' Mostly used internally by `cas_download`.
#'
#' @param download_df A data frame with four columns: `id`, `url`, `path`, `type`.
#' @param overwrite_file Logical, defaults to FALSE.
#'
#' @return Invisibly returns the full `httr` response.
#' @inheritParams cas_download
#' @inheritParams cas_write_to_db
#' @export
#'
#' @examples
cas_download_internal <- function(download_df = NULL,
                                  index = FALSE,
                                  index_group = NULL,
                                  overwrite_file = FALSE,
                                  ignore_id = TRUE,
                                  wait = 1,
                                  create_folder_if_missing = NULL,
                                  db_connection = NULL,
                                  disconnect_db = FALSE,
                                  sample = FALSE,
                                  file_format = "html",
                                  ...) {
  type <- dplyr::if_else(condition = index,
    true = "index",
    false = "contents"
  )

  db <- cas_connect_to_db(
    db_connection = db_connection,
    ...
  )

  if (is.null(download_df)) {
    download_df <- cas_get_files_to_download(
      index = index,
      index_group = index_group,
      create_folder_if_missing = create_folder_if_missing,
      ignore_id = ignore_id,
      db_connection = db,
      disconnect_db = FALSE,
      file_format = file_format,
      ...
    ) %>%
      dplyr::collect()
  }

  if (nrow(download_df) == 0) {
    cli::cli_inform(c(i = "No new files or pages to download."))
    return(invisible(NULL))
  } else {
    current_batch_folder <- fs::path_dir(path = download_df[["path"]][1])
    if (fs::file_exists(current_batch_folder) == FALSE) {
      fs::dir_create(path = current_batch_folder)
      cli::cli_inform(c(i = "The folder {.path {current_batch_folder}} for the current download batch has been created."))
    }
  }

  if (is.numeric(sample) == TRUE) {
    download_df <- download_df %>%
      dplyr::slice_sample(n = sample)
  } else if (isTRUE(sample)) {
    download_df <- download_df %>%
      dplyr::slice_sample(p = 1)
  }

  pb <- progress::progress_bar$new(total = nrow(download_df))

  purrr::walk(
    .x = purrr::transpose(download_df),
    .f = function(x) {
      pb$tick()
      if (fs::file_exists(x$path) == FALSE | overwrite_file == TRUE) {
        raw <- tryCatch(
          expr = download.file(
            url = x$url,
            destfile = x$path
          ),
          error = function(e) {
            e
          }
        )

        if (inherits(raw, "error") == FALSE) {
          info_df <- tibble::tibble(
            id = x$id,
            batch = x$batch,
            datetime = Sys.time(),
            status = 200L,
            size = fs::file_size(x$path)
          )

          cas_write_to_db(
            df = info_df,
            table = stringr::str_c(type, "_", "download"),
            db_connection = db,
            disconnect_db = FALSE,
            ...
          )
        } else {
          cli::cli_alert_danger(c(x = raw))
          cli::cli_warn("Error while downloading page with id {.val {x$id}} and url {.url {x$url}}. The process will proceed with other pages.")
        }
        Sys.sleep(time = wait)
      }
    }
  )

  cas_disconnect_from_db(
    db_connection = db,
    disconnect_db = disconnect_db
  )
}
giocomai/castarter documentation built on May 4, 2024, 1:14 a.m.