R/rsync_tools.R

Defines functions check_htids htid_to_rsync rsync_from_hathi

Documented in htid_to_rsync rsync_from_hathi

#' rsync Hathi Trust EFs from Hathi Trust
#'
#' Given a vector of Hathi Trust IDs (generated, for example, from
#' [workset_builder]), this function attempts to use
#' [rsync](https://en.wikipedia.org/wiki/Rsync) to download their Extracted
#' Features (per-page word counts and part of speech information) files from
#' Hathi Trust. `rsync` needs to be installed in your system and accessible via
#' [system] or [system2].
#'
#' @param htids A vector of Hathi Trust IDs, a workset generated by
#'   [workset_builder], or a data frame with a column named 'htid'.
#' @param dir The directory to save the downloaded EF files to. This defaults to
#'   `getOption("hathiTools.ef.dir")`, which is just  "./hathi-ef" on loading.
#'
#' @return The return code of [system] or [system2] used to run `rsync`, which
#'   must be installed on your system.
#' @export
#'
#' @examples
#' \donttest{
#' res <- workset_builder(c("tylenol", "paracetamol"), volumes_only = FALSE)
#' rsync_from_hathi(res$htid[1:10], dir = tempdir())
#' }
rsync_from_hathi <- function(htids, dir = getOption("hathiTools.ef.dir")) {

  htids <- check_htids(htids)

  if(!fs::dir_exists(dir)) {
    message("Creating directory to sync JSON EF files at ", dir)
  }

  hathi_ef_directory <- fs::dir_create(dir) %>%
    fs::path_rel()

  tmp <- tempfile(tmpdir = ".", fileext = ".txt")

  suppressMessages(htlinks <- htid_to_rsync(htids, tmp))

  windows <- Sys.info()["sysname"] == "Windows"
  if(!windows) {
    command_args <- paste("-av --files-from",
                          tmp,
                          "data.analytics.hathitrust.org::features-2020.03/",
                          hathi_ef_directory)

    result <- system2("rsync", args = command_args)

  } else {

    command <- paste("rsync -av --files-from",
                     paste0("\'", tmp, "\'"),
                     "data.analytics.hathitrust.org::features-2020.03/",
                     paste0("\'", hathi_ef_directory,
                            "\'"))

    command <- stringr::str_replace_all(command, "\\\\", "/")

    command <- stringr::str_glue('bash -c {shQuote(command)}')

    result <- system(command)
    cat(result, "\n")

  }

  fs::file_delete(tmp)

  result


}

#' Converts a list of htids to relative paths for rsync to download
#'
#' @param htids A character vector of HathiTrust ids (htids), a workset
#'   generated by [workset_builder], or a data frame with a column named 'htid'
#'   and containing the htids.
#' @param file A text file to save the resulting list of relative stubbytree
#'   paths to use in the command `rsync -av --files-from FILE.txt
#'   data.analytics.hathitrust.org::features-2020.03/ hathi-ef/`
#'
#' @section Details:
#'
#'   If you have a lot of files to download, generating the list of relative
#'   stubbytree paths and using rsync is much faster than using
#'   [get_hathi_counts] over a list of htids. But rsync only downloads json
#'   files, so calling [get_hathi_counts] on a downloaded json file will be
#'   slower the first time as the function will cache the json file to csv or
#'   another format. It is best to run [cache_htids] after using rsync to reduce
#'   this performance penalty.
#'
#' @return The list of relative paths saved to the file (invisibly).
#' @export
#'
#' @examples
#' htid_to_rsync(c("nc01.ark:/13960/t2v41mn4r", "mdp.39015001796443"), tempfile())
htid_to_rsync <- function(htids, file) {
  htids <- check_htids(htids)

  rel_paths <- htids %>%
    purrr::map_chr(stubby_url_to_rsync)

  writeLines(rel_paths, file)

  message(stringr::str_glue("Use rsync -av --files-from {file}"),
          " data.analytics.hathitrust.org::features-2020.03/ ",
          "hathi-ef/ to download EF files to hathi-ef directory")

  invisible(rel_paths)

}

check_htids <- function(htids) {
  if("workset_hathi" %in% class(htids)) {
    htids <- htids$htid
  } else if("data.frame" %in% class(htids)) {
    if(!"htid" %in% names(htids)) {
      stop("Cannot find a column named 'htid' in the data frame of htids")
    }
    htids <- htids$htid
  } else if(!is.character(htids)) {
    stop("htids must be a character vector, a workset produced via workset_builder",
         ", or a data frame with a column named 'htid' and containing the htids.")

  }

  unique(htids)
}
xmarquez/hathiTools documentation built on June 2, 2025, 5:12 a.m.