castarter: Content Analysis Starter Toolkit

Documented in cas_get_files_to_extract

#' Get path to (locally available) files to be extracted
#'
#' Mostly used internally by [cas_extract] or for troubleshooting.
#'
#' @param only_available Defaults to TRUE. If TRUE, returns only files available
#'   locally. If FALSE, returns also path to files that according to logging
#'   data have already been downloaded, yet are not available where expected.
#' @inheritParams cas_extract
#'
#' @return
#' @export
#'
#' @examples
#' #'
#' \dontrun{
#' if (interactive) {
#'   cas_get_files_to_extract()
#' }
#' }
cas_get_files_to_extract <- function(
  id = NULL,
  ignore_id = TRUE,
  custom_path = NULL,
  index = FALSE,
  store_as_character = TRUE,
  check_previous = TRUE,
  db_connection = NULL,
  file_format = "html",
  sample = FALSE,
  keep_if_status = 200,
  only_available = TRUE,
  ...
) {
  ellipsis::check_dots_unnamed()

  db <- cas_connect_to_db(
    db_connection = db_connection,
    ...
  )

  type <- dplyr::if_else(condition = index, true = "index", false = "contents")

  if (is.null(custom_path)) {
    path <- cas_get_base_path(...)
  } else {
    path_ending <- stringr::str_c(file_format, type, sep = "_")

    custom_path_ending <- fs::path_file(custom_path)

    if (path_ending == custom_path_ending) {
      path <- fs::path(custom_path)
    } else {
      path <- fs::path(
        custom_path,
        path_ending
      )
    }
  }

  previous_download_df <- cas_read_db_download(
    index = index,
    db_connection = db,
    disconnect_db = FALSE,
    ...
  ) |>
    dplyr::arrange(dplyr::desc(datetime)) |>
    dplyr::distinct(id, .keep_all = TRUE) |>
    dplyr::arrange(id, batch, datetime) |>
    dplyr::filter(status %in% keep_if_status)

  stored_files_df <- previous_download_df |>
    dplyr::select("id", "batch") |>
    dplyr::mutate(
      path = fs::path(
        path,
        batch,
        stringr::str_c(id, "_", batch, ".", file_format)
      )
    ) |>
    dplyr::select("id", "path")

  if (store_as_character == TRUE) {
    stored_files_df <- stored_files_df |>
      dplyr::mutate(id = as.character(id))
  }

  if (check_previous == FALSE) {
    files_to_extract_pre_df <- stored_files_df
    write_to_db <- FALSE
  } else {
    # Do not process previously extracted
    previously_extracted_df <- cas_read_db_contents_data(
      db_connection = db,
      disconnect_db = FALSE,
      ...
    )

    if (is.null(previously_extracted_df) == FALSE) {
      previously_extracted_df <- previously_extracted_df |>
        dplyr::select(id) |>
        dplyr::collect()

      if (store_as_character == TRUE) {
        previously_extracted_df <- previously_extracted_df |>
          dplyr::mutate(id = as.character(id))
      }
    }

    if (is.null(previously_extracted_df) == FALSE) {
      files_to_extract_pre_df <- dplyr::anti_join(
        x = stored_files_df,
        y = previously_extracted_df,
        by = "id"
      )
    } else {
      files_to_extract_pre_df <- stored_files_df
    }
  }

  if (nrow(files_to_extract_pre_df) == 0) {
    empty_df <- tibble::tibble(
      id = character(),
      path = fs::path(),
      url = character(),
      link_text = character(),
      source_index_id = double(),
      source_index_batch = double(),
      available = logical()
    )
    return(empty_df)
  }

  contents_id_df <- cas_read_db_contents_id(
    db_connection = db,
    disconnect_db = FALSE,
    ...
  ) |>
    dplyr::collect()

  if (store_as_character == TRUE) {
    contents_id_df <- contents_id_df |>
      dplyr::mutate(id = as.character(id))
  }

  files_to_extract_df <- files_to_extract_pre_df |>
    dplyr::inner_join(
      y = contents_id_df,
      by = "id"
    )

  if (is.null(id) == FALSE) {
    id_to_keep <- id
    files_to_extract_df <- files_to_extract_df |>
      dplyr::filter(id %in% id_to_keep)
  }

  if (isTRUE(ignore_id)) {
    ignore_id <- cas_read_db_ignore_id(
      db_connection = db,
      disconnect_db = FALSE
    ) |>
      dplyr::pull(id)

    files_to_extract_df <- files_to_extract_df |>
      dplyr::filter(!(id %in% ignore_id))
  } else if (is.numeric(ignore_id)) {
    files_to_extract_df <- files_to_extract_df |>
      dplyr::filter(!(id %in% ignore_id))
  }

  if (is.numeric(sample) == TRUE) {
    if (sample > nrow(files_to_extract_df)) {
      sample <- nrow(files_to_extract_df)
    }

    files_to_extract_df <- files_to_extract_df |>
      dplyr::slice_sample(n = sample)
  } else if (isTRUE(sample)) {
    files_to_extract_df <- files_to_extract_df |>
      dplyr::slice_sample(p = 1)
  }

  available_files_to_extract_df <- files_to_extract_df |>
    dplyr::mutate(available = fs::file_exists(path)) |>
    dplyr::filter(available)

  if (nrow(available_files_to_extract_df) != nrow(files_to_extract_df)) {
    cli::cli_warn(c(
      `x` = glue::glue(
        "Not all downloaded files are currently available in their expected location."
      ),
      `*` = glue::glue(
        "Total files expected:  {scales::number(nrow(files_to_extract_df))}"
      ),
      `*` = glue::glue(
        "Total files available: {scales::number(nrow(available_files_to_extract_df))}"
      ),
      `i` = glue::glue(
        "Only available files will be processed. Consider running `cas_restore()` or otherwise deal with missing files as needed."
      )
    ))
  }

  if (only_available) {
    available_files_to_extract_df
  } else {
    files_to_extract_df |>
      dplyr::mutate(available = fs::file_exists(path))
  }
}