R/download.R

Defines functions add_fts_table src_sqlite_institutions create_db institutions_download

Documented in institutions_download

#' Download the GRID dataset and install the local SQLite3 database
#'
#'@param overwrite logical indicating if local db should be overwritten
#'@param cfg the config to use when downloading, by default institutions_cfg()
#'@return logical indicating if the db exists locally, invisibly
#'@importFrom utils download.file
#'@export
institutions_download <- function(overwrite = FALSE, cfg = institutions_cfg()) {

  stopifnot(is.logical(overwrite))

  #cfg <- institutions_cfg()

  if (!dir.exists(dirname(cfg$zip)))
    dir.create(dirname(cfg$zip), recursive = TRUE)

  if (overwrite || !file.exists(cfg$zip)) {
    message("Downloading data from ", cfg$src_url)
    download.file(
      cfg$src_url, destfile = cfg$zip,
      mode = "wb", quiet = TRUE
    )

  }

  if (overwrite || !file.exists(cfg$db)) {
    if (file.exists(cfg$db)) {
      message("Deleting existing db at ", cfg$db)
      file.remove(cfg$db)
    }
    message("Generating db, storing at ", cfg$db)
    create_db()
    message("Generating Full Text Search index")
    add_fts_table()
  }

  invisible(file.exists(cfg$db))
}

#' Create local sqlite3 db from downloaded GRID zip data
#'
#' @importFrom readr read_csv cols
#' @importFrom tools file_path_sans_ext
#' @importFrom stringr str_ends
#' @importFrom zip zip_list
#' @importFrom RSQLite dbWriteTable dbDisconnect
#' @importFrom purrr walk
#' @importFrom dplyr filter pull
#' @importFrom rlang .data
#' @noRd
create_db <- function(db, src_zip) {

  if (missing(db))
    db <- institutions_cfg()$db

  if (missing(src_zip))
    src_zip <- institutions_cfg()$zip

  con <- dbConnect(RSQLite::SQLite(), db)

  zips <-
    src_zip %>%
    zip::zip_list() %>%
    filter(stringr::str_ends(.data$filename, "csv")) %>%
    pull(.data$filename)

  zipcsv_table <- function(filepath) {
    basename(tools::file_path_sans_ext(filepath))
  }

  migrate_table <- function(src_zip, zipcsv, con) {
    df <- read_csv(unz(src_zip, zipcsv),
      col_types = cols(), na = "", guess_max = 1e5, show_col_types = FALSE)
    probs <- readr::problems(df)
    if (nrow(probs) > 0) {
      warning("Parsing issues found for ", zipcsv, " in ", src_zip)
      print(probs)
    }
    dbWriteTable(con, zipcsv_table(zipcsv), df, overwrite = TRUE)
    #    copy_to(con, df, name = zipcsv_table(zipcsv), overwrite = TRUE)
  }

  purrr::walk(zips, function(x) migrate_table(src_zip, x, con))

  dbDisconnect(con)

  invisible(file.exists(db))

}


#' GRID Institutions Database SQLite database connection
#' @importFrom dplyr src_sqlite
#' @importFrom RSQLite dbConnect SQLite
#' @importFrom dbplyr src_dbi
#' @noRd
src_sqlite_institutions <- function() {

  cfg <- institutions_cfg()

  if (!file.exists(cfg$db)) {
    warning("No database available at ", cfg$db,
         " please use institutions_download() first")
    institutions_download()
  }

  RSQLite::dbConnect(RSQLite::SQLite(), cfg$db)
}

#' @importFrom RSQLite dbListTables
#' @noRd
add_fts_table <- function() {

  con <- src_sqlite_institutions()

  if (!"fts" %in% RSQLite::dbListTables(con)) {
    RSQLite::dbExecute(con, statement =
      "create virtual table fts using fts5(institutes, grid_id);")

    RSQLite::dbExecute(con, statement = paste0(
      "insert into fts select name as institutes, grid_id from institutes;"))
  }

  RSQLite::dbDisconnect(con)

}
KTH-Library/institutions documentation built on June 10, 2025, 10:04 a.m.