R/cas_db_index.R

Defines functions cas_read_db_download cas_read_db_index cas_write_db_index

Documented in cas_read_db_download cas_read_db_index cas_write_db_index

#' Write index URLs to local database
#'
#' If some URLs are already included in the database, it appends only the new
#' ones: URLs are expected to be unique.
#'
#' @param urls A data frame with three columns, with the same name and type as
#'   \code{casdb_empty_index_id}, or a character vector.
#'
#' @inheritParams cas_write_to_db
#'
#' @return Invisibly returns only new rows added.
#' @export
#'
#' @examples
#'
#' cas_set_options(
#'   base_folder = fs::path(fs::path_temp(), "R", "castarter_data"),
#'   db_folder = fs::path(fs::path_temp(), "R", "castarter_data"),
#'   project = "example_project",
#'   website = "example_website"
#' )
#' cas_enable_db()
#'
#'
#' urls_df <- cas_build_urls(
#'   url = "https://www.example.com/news/",
#'   start_page = 1,
#'   end_page = 10
#' )
#'
#' cas_write_db_index(urls = urls_df)
#'
#' cas_read_db_index()
cas_write_db_index <- function(
  urls,
  overwrite = FALSE,
  db_connection = NULL,
  disconnect_db = FALSE,
  ...
) {
  if (is.data.frame(urls)) {
    if (
      identical(colnames(urls), colnames(casdb_empty_index_id)) &
        identical(sapply(urls, class), sapply(casdb_empty_index_id, class))
    ) {
      urls_df <- urls
    } else {
      cli::cli_abort(
        "{.var urls} data frame must match exactly the column names and types of {.code casdb_empty_index_id}"
      )
    }
  } else {
    urls_df <- cas_build_urls(
      url = urls,
      url_ending = "",
      start_page = NULL,
      end_page = NULL
    )
  }

  if (cas_check_use_db(...) == FALSE) {
    return(invisible(NULL))
  }

  db <- cas_connect_to_db(
    db_connection = db_connection,
    ...
  )

  previous_index_df <- cas_read_db_index(
    db_connection = db,
    disconnect_db = FALSE,
    ...
  ) |>
    dplyr::collect()

  if (nrow(previous_index_df) > 0) {
    urls_to_add_df <- urls_df |>
      dplyr::anti_join(
        y = previous_index_df,
        by = c("url", "index_group")
      )

    if (sum(is.element(urls_to_add_df$id, previous_index_df$id)) > 0) {
      cli::cli_inform("Introducing new {.code id} to ensure unique values.")
      urls_to_add_df$id <- seq(
        sum(max(previous_index_df$id), 1),
        sum(
          max(previous_index_df$id),
          nrow(urls_to_add_df)
        )
      ) |>
        as.numeric()
    }
  } else {
    urls_to_add_df <- urls_df
  }

  urls_to_add_n <- nrow(urls_to_add_df)
  if (urls_to_add_n > 0) {
    cas_write_to_db(
      df = urls_to_add_df,
      table = "index_id",
      overwrite = overwrite,
      disconnect_db = FALSE,
      db_connection = db,
      ...
    )

    cli::cli_inform(
      c(v = "Urls added to {.field index_id} table: {.val {urls_to_add_n}}")
    )
  } else {
    cli::cli_inform(
      c(i = "No new url added to {.field index_id} table.")
    )
  }

  cas_disconnect_from_db(
    db_connection = db,
    disconnect_db = disconnect_db
  )

  invisible(urls_to_add_df)
}

#' Read index from local database
#'
#' @inheritParams cas_write_to_db
#'
#' @return A data frame with three columns and data stored in the `index_id`
#'   table of the local database. The data frame has zero rows if the database
#'   does not exist or no data was previously stored there.
#' @export
#'
#' @examples
#' cas_set_options(
#'   base_folder = fs::path(tempdir(), "R", "castarter_data"),
#'   db_folder = fs::path(tempdir(), "R", "castarter_data"),
#'   project = "example_project",
#'   website = "example_website"
#' )
#' cas_enable_db()
#'
#'
#' urls_df <- cas_build_urls(
#'   url = "https://www.example.com/news/",
#'   start_page = 1,
#'   end_page = 10
#' )
#'
#' cas_write_db_index(urls = urls_df)
#'
#' cas_read_db_index()
cas_read_db_index <- function(
  db_connection = NULL,
  db_folder = NULL,
  index_group = NULL,
  ...
) {
  db_result <- tryCatch(
    cas_read_from_db(
      table = "index_id",
      db_folder = db_folder,
      db_connection = db_connection,
      ...
    ),
    error = function(e) {
      logical(1L)
    }
  )

  if (is.null(db_result)) {
    tibble::as_tibble(casdb_empty_index_id)
  } else if (isFALSE(db_result)) {
    tibble::as_tibble(casdb_empty_index_id)
  } else {
    if (is.null(index_group) == FALSE) {
      db_result |>
        dplyr::filter(index_group == !!index_group)
    } else {
      db_result
    }
  }
}


#' Read index from local database
#'
#' @inheritParams cas_write_to_db
#' @param batch Default to "latest": returns only the path to the file with the
#'   highest batch identifier available. Valid values are: "latest", "all", or a
#'   numeric identifier corresponding to desired batch.
#' @param status Defaults to 200. Keeps only files downloaded with the given
#'   status (can be more than one, given as a vector). If NULL, no filter based
#'   on status is applied.
#'
#' @return A data frame with three columns and data stored in the `index_id`
#'   table of the local database. The data frame has zero rows if the database
#'   does not exist or no data was previously stored there.
#' @export
#'
#' @examples
#' cas_set_options(
#'   base_folder = fs::path(tempdir(), "R", "castarter_data"),
#'   db_folder = fs::path(tempdir(), "R", "castarter_data"),
#'   project = "example_project",
#'   website = "example_website"
#' )
#' cas_enable_db()
#'
#'
#' urls_df <- cas_build_urls(
#'   url = "https://www.example.com/news/",
#'   start_page = 1,
#'   end_page = 10
#' )
#'
#' cas_write_db_index(urls = urls_df)
#'
#' cas_read_db_index()
cas_read_db_download <- function(
  index = FALSE,
  id = NULL,
  batch = "latest",
  status = 200L,
  db_connection = NULL,
  db_folder = NULL,
  ...
) {
  type <- dplyr::if_else(
    condition = index,
    true = "index",
    false = "contents"
  )

  db_result <- tryCatch(
    cas_read_from_db(
      table = stringr::str_c(type, "_", "download"),
      db_connection = db_connection,
      db_folder = db_folder,
      ...
    ),
    error = function(e) {
      logical(1L)
    }
  )

  if (is.null(db_result)) {
    tibble::as_tibble(casdb_empty_download)
  } else if (isFALSE(db_result)) {
    tibble::as_tibble(casdb_empty_download)
  } else {
    if (ncol(db_result) == 0) {
      tibble::as_tibble(casdb_empty_download)
    } else {
      if (is.null(id) == FALSE) {
        db_result <- db_result |>
          dplyr::filter(id %in% {{ id }})
      }

      if (is.null(batch) == TRUE) {
        # do nothing
      } else if (is.numeric(batch)) {
        db_result <- db_result |>
          dplyr::filter(batch %in% {{ batch }})
      } else if (batch == "latest") {
        db_result <- db_result |>
          dplyr::slice_max(batch, n = 1, by = "id")
      }

      if (is.null(status) == TRUE) {
        # do nothing
      } else if (is.numeric(status)) {
        db_result <- db_result |>
          dplyr::filter(status %in% {{ status }})
      }

      db_result |>
        dplyr::collect() |>
        dplyr::mutate(
          datetime = lubridate::as_datetime(datetime),
          size = fs::as_fs_bytes(size)
        )
    }
  }
}
giocomai/castarter documentation built on June 12, 2025, 8:49 p.m.