#' rsync Hathi Trust EFs from Hathi Trust
#'
#' Given a vector of Hathi Trust IDs (generated, for example, from
#' [workset_builder]), this function attempts to use
#' [rsync](https://en.wikipedia.org/wiki/Rsync) to download their Extracted
#' Features (per-page word counts and part of speech information) files from
#' Hathi Trust. `rsync` needs to be installed in your system and accessible via
#' [system] or [system2].
#'
#' @param htids A vector of Hathi Trust IDs, a workset generated by
#' [workset_builder], or a data frame with a column named 'htid'.
#' @param dir The directory to save the downloaded EF files to. This defaults to
#' `getOption("hathiTools.ef.dir")`, which is just "./hathi-ef" on loading.
#'
#' @return The return code of [system] or [system2] used to run `rsync`, which
#' must be installed on your system.
#' @export
#'
#' @examples
#' \donttest{
#' res <- workset_builder(c("tylenol", "paracetamol"), volumes_only = FALSE)
#' rsync_from_hathi(res$htid[1:10], dir = tempdir())
#' }
rsync_from_hathi <- function(htids, dir = getOption("hathiTools.ef.dir")) {
htids <- check_htids(htids)
if(!fs::dir_exists(dir)) {
message("Creating directory to sync JSON EF files at ", dir)
}
hathi_ef_directory <- fs::dir_create(dir) %>%
fs::path_rel()
tmp <- tempfile(tmpdir = ".", fileext = ".txt")
suppressMessages(htlinks <- htid_to_rsync(htids, tmp))
windows <- Sys.info()["sysname"] == "Windows"
if(!windows) {
command_args <- paste("-av --files-from",
tmp,
"data.analytics.hathitrust.org::features-2020.03/",
hathi_ef_directory)
result <- system2("rsync", args = command_args)
} else {
command <- paste("rsync -av --files-from",
paste0("\'", tmp, "\'"),
"data.analytics.hathitrust.org::features-2020.03/",
paste0("\'", hathi_ef_directory,
"\'"))
command <- stringr::str_replace_all(command, "\\\\", "/")
command <- stringr::str_glue('bash -c {shQuote(command)}')
result <- system(command)
cat(result, "\n")
}
fs::file_delete(tmp)
result
}
#' Converts a list of htids to relative paths for rsync to download
#'
#' @param htids A character vector of HathiTrust ids (htids), a workset
#' generated by [workset_builder], or a data frame with a column named 'htid'
#' and containing the htids.
#' @param file A text file to save the resulting list of relative stubbytree
#' paths to use in the command `rsync -av --files-from FILE.txt
#' data.analytics.hathitrust.org::features-2020.03/ hathi-ef/`
#'
#' @section Details:
#'
#' If you have a lot of files to download, generating the list of relative
#' stubbytree paths and using rsync is much faster than using
#' [get_hathi_counts] over a list of htids. But rsync only downloads json
#' files, so calling [get_hathi_counts] on a downloaded json file will be
#' slower the first time as the function will cache the json file to csv or
#' another format. It is best to run [cache_htids] after using rsync to reduce
#' this performance penalty.
#'
#' @return The list of relative paths saved to the file (invisibly).
#' @export
#'
#' @examples
#' htid_to_rsync(c("nc01.ark:/13960/t2v41mn4r", "mdp.39015001796443"), tempfile())
htid_to_rsync <- function(htids, file) {
htids <- check_htids(htids)
rel_paths <- htids %>%
purrr::map_chr(stubby_url_to_rsync)
writeLines(rel_paths, file)
message(stringr::str_glue("Use rsync -av --files-from {file}"),
" data.analytics.hathitrust.org::features-2020.03/ ",
"hathi-ef/ to download EF files to hathi-ef directory")
invisible(rel_paths)
}
check_htids <- function(htids) {
if("workset_hathi" %in% class(htids)) {
htids <- htids$htid
} else if("data.frame" %in% class(htids)) {
if(!"htid" %in% names(htids)) {
stop("Cannot find a column named 'htid' in the data frame of htids")
}
htids <- htids$htid
} else if(!is.character(htids)) {
stop("htids must be a character vector, a workset produced via workset_builder",
", or a data frame with a column named 'htid' and containing the htids.")
}
unique(htids)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.