# vim:textwidth=128:expandtab:shiftwidth=4:softtabstop=4
argoFloatsCacheEnv <- new.env(parent=emptyenv())
#' Check Whether an Item is Cached
#'
#' @param name character value, naming the item.
#'
#' @param debug an integer, passed to [argoFloatsDebug()].
#'
#' @export
#'
#' @return A logical value indicating whether a cached value is available.
#'
#' @family functions relating to cached values
#'
#' @author Dan Kelley, making a thin copy of code written by Dewey Dunnington
argoFloatsIsCached <- function(name, debug=0L)
{
argoFloatsDebug(debug, "argoFloatsIsCached(name=\"", name, "\") {\n", sep="", style="bold", unindent=1)
rval <- name %in% names(argoFloatsCacheEnv)
argoFloatsDebug(debug, "returning", rval, "\n")
argoFloatsDebug(debug, "} argoFloatsIsCached()\n", style="bold", unindent=1)
rval
}
#' Check When an Item was Cached
#'
#' @param name character value, naming the item.
#'
#' @param debug an integer, passed to [argoFloatsDebug()].
#'
#' @export
#'
#' @return A logical value indicating whether a cached value is available.
#'
#' @family functions relating to cached values
#'
#' @author Dan Kelley, making a thin copy of code written by Dewey Dunnington
argoFloatsWhenCached <- function(name, debug=0L)
{
argoFloatsDebug(debug, "argoFloatsWhenCached(name=\"", name, "\") {\n", sep="", style="bold", unindent=1)
nameTime <- paste0(name, "Time")
rval <- argoFloatsCacheEnv[[nameTime]]
argoFloatsDebug(debug, "returning", format(as.POSIXct(rval, start="1970-01-01")), "\n")
argoFloatsDebug(debug, "} argoFloatsWhenCached()\n", style="bold", unindent=1)
rval
}
#' Get an Item From The Cache
#'
#' @param name character value, naming the item.
#'
#' @param debug an integer, passed to [argoFloatsDebug()].
#'
#' @return The cached value, as stored with [argoFloatsStoreInCache()].
#'
#' @family functions relating to cached values
#'
#' @export
argoFloatsGetFromCache <- function(name, debug=0)
{
argoFloatsDebug(debug, "argoFloatsGetFromCache(name=\"", name, "\")\n", sep="", style="bold", unindent=1)
rval <- argoFloatsCacheEnv[[name]]
argoFloatsDebug(debug, "} # argoFloatsGetFromCached()\n", style="bold", unindent=1)
rval
}
#' Store an Item in the Cache
#'
#' @param name character value, naming the item.
#'
#' @param value the new contents of the item.
#'
#' @param debug an integer, passed to [argoFloatsDebug()].
#'
#' @return None (invisible NULL).
#'
#' @family functions relating to cached values
#'
#' @export
argoFloatsStoreInCache <- function(name, value, debug=0)
{
argoFloatsDebug(debug, "argoFloatsStoreInCache(name=\"", name, "\")\n", sep="", style="bold", unindent=1)
argoFloatsCacheEnv[[name]] <- value
now <- Sys.time()
argoFloatsCacheEnv[[paste0(name, "Time")]] <- now
argoFloatsDebug(debug, "} argoFloatsStoreInCache()\n", style="bold", unindent=1)
invisible(NULL)
}
#' Clear the Cache
#'
#' Clear the cache. This is meant mainly for developers working in interactive
#' sessions to test coding changes.
#'
#' @param debug an integer, passed to [argoFloatsDebug()].
#'
#' @export
#'
#' @return integer, returned invisibly, indicating the number of items removed.
#'
#' @family functions relating to cached values
#'
#' @author Dan Kelley, making a thin copy of code written by Dewey Dunnington
argoFloatsClearCache <- function(debug=0L)
{
argoFloatsDebug(debug, "argoFloatsClearCache() {\n", sep="", style="bold", unindent=1)
removed <- 0L
for (name in names(argoFloatsCacheEnv)) {
argoFloatsDebug(debug, "removing \"", name, "\"\n", sep="")
argoFloatsCacheEnv[[name]] <- NULL
removed <- removed + 1L
}
argoFloatsDebug(debug, "} argoFloatsClearCache()\n", style="bold", unindent=1)
invisible(removed)
}
#' List Items in the Cache
#'
#' List items in the cache.
#'
#' @param debug an integer, passed to [argoFloatsDebug()].
#'
#' @export
#'
#' @return character vector of names of items.
#'
#' @family functions relating to cached values
#'
#' @author Dan Kelley, making a thin copy of code written by Dewey Dunnington
argoFloatsListCache <- function(debug=0L)
{
if (!is.numeric(debug)) {
stop("debug parameter must be numeric")
}
argoFloatsDebug(debug, "argoFloatsListCache() {\n", sep="", style="bold", unindent=1)
rval <- names(argoFloatsCacheEnv)
argoFloatsDebug(debug, "} argoFloatsListCache()\n", style="bold", unindent=1)
rval
}
#' Get Data for an Argo Float Profile
#'
#' @param url character value giving the URL for a NetCDF file containing an
#' particular profile of a particular Argo float.
#' @template destdir
#'
#' @param destfile optional character value that specifies the name to be used
#' for the downloaded file. If this is not specified, then a name is determined
#' from the value of `url`.
#'
#' @template age
#'
#' @template retries
#'
#' @template quiet
#'
#' @template debug
#'
#' @return A character value naming the local location of the downloaded file,
#' or `NA` if the file could not be downloaded.
#'
## NOTE: this will take longer than 5s, so I had marked it dontrun on an initial
## CRAN submission. THe CRAN review suggested marking it donttest instead, but
## when I do that, rhub::check_for_cran() gives an error on the getIndex, relating
## to the non-existence of ~/data/argo. But, really, what's the point of these
## examples? The docs are really quite straightforward, and we have a paper,
## besides. I don't see that these examples illustrate anything that the docs
## don't already illustrate, and so I have removed them. --DK
## @examples
##\dontrun{
## # This is not run, because it takes longer than 5s to complete.
## # These examples assume that the ~/data/argo directory exists.
## library(argoFloats)
## library(oce)
##
## # Example 1: a particular file
## url <- "ftp://ftp.ifremer.fr/ifremer/argo/dac/nmdis/2901633/profiles/R2901633_071.nc"
## file <- getProfileFromUrl(url=url)
## argo <- read.argo(file)
## # Note that oce::plot() changes par(mfrow) and par(mar).
## oce::plot(argo, which=c(1, 4, 6, 5))
##
## # Example 2: argo profile nearest Sable Island
## index <- getIndex()
## lon0 <- -59.9149
## lat0 <- 43.9337
##
## dist <- oce::geodDist(index[["longitude"]], index[["latitude"]], lon0, lat0)
## w <- which.min(dist)
## url <- paste0(index[["metadata"]][["ftpRoot"]][1], "/", index[["file"]][w])
## fileSable <- getProfileFromUrl(url=url)
## argoSable <- read.oce(fileSable)
## # Note that oce::plot() changes par(mfrow) and par(mar).
## plot(argoSable, which=c(1, 4, 6, 5))
##}
#'
#' @export
#'
#' @author Dan Kelley
getProfileFromUrl <- function(url=NULL, destdir=argoDefaultDestdir(), destfile=NULL, age=argoDefaultProfileAge(), retries=3, quiet=FALSE, debug=0)
{
argoFloatsDebug(debug, "getProfileFromUrl(url=\"", url, "\", destdir=\"", destdir, "\", destfile=\"",
if (missing(destfile)) "(missing)" else destfile, "\", ...) {", sep="", "\n", style="bold", unindent=1)
if (length(url) != 1)
stop("url must be of length 1, not of length ", length(url))
if (is.null(destfile)) {
destfile <- gsub(".*/(.*).nc", "\\1.nc", url)
argoFloatsDebug(debug, "inferred destfile=\"", destfile, "\" from url.\n", sep="")
}
res <- downloadWithRetries(url=url, destdir=destdir, destfile=destfile, quiet=quiet,
age=age, retries=retries, debug=debug-1)
argoFloatsDebug(debug, "} # getProfileFromUrl()", sep="", "\n", style="bold", unindent=1)
if (is.na(res)) res else paste0(destdir, "/", destfile)
}
#' Get an Index of Available Argo Float Profiles
#'
#' This function gets an index of available Argo float profiles, typically
#' for later use as the first argument to [getProfiles()]. The source for the
#' index may be (a) a remote data repository, (b) a local repository (see the
#' `keep` argument), or (c) a cached RDA file that contains the result
#' of a previous call to [getIndex()] (see the `age` parameter).
#'
#' **Using an index from a remote server**
#'
#' The first step is to construct a URL for downloading, based on the
#' `url` and `file` arguments. That URL will be a string ending in `.gz`,
#' or `.txt` and from this the name of a local file is constructed
#' by changing the suffix to `.rda` and prepending the file directory
#' specified by `destdir`. If an `.rda` file of that name already exists,
#' and is less than `age` days old, then no downloading takes place. This
#' caching procedure is a way to save time, because the download can take
#' from a minute to an hour, depending on the bandwidth of the connection
#' to the
#' server.
#'
#' The resultant `.rda` file, which is named in the return value of this
#' function, holds a list named `index` that holds following elements:
#' * `ftpRoot`, the FTP root stored in the header of the source `file`
#' (see next paragraph).
#' * `server`, the URL at which the index was found, and from
#' which [getProfiles()] can construct URLs from which to
#' download the NetCDF files for individual float profiles.
#' * `filename`, the argument provided here.
#' * `header`, the preliminary lines in the source file that start
#' with the `#` character.
#' * `data`, a data frame containing the items in the source file.
#' The names of these items are determined automatically from
#' `"core"`,`"bgcargo"`, `"synthetic"` files.
#'
#' Some expertise is required in deciding on the value for the
#' `file` argument to [getIndex()]. As of March 2023, the
#' FTP sites
#' `ftp://usgodae.org/pub/outgoing/argo`
#' and
#' `ftp://ftp.ifremer.fr/ifremer/argo`
#' contain multiple index files, as listed in the left-hand column of the
#' following table. The middle column lists nicknames
#' for some of the files. These can be provided as the `file` argument,
#' as alternatives to the full names.
#' The right-hand column describes the file contents.
#' Note that the servers also provide files with names similar to those
#' given in the table, but ending in `.txt`. These are uncompressed
#' equivalents of the `.gz` files that offer no advantage and take
#' longer to download, so [getIndex()] is not designed to work with them.
#' \tabular{lll}{
#' *File Name* \tab *Nickname* \tab *Contents*\cr
#' `ar_greylist.txt` \tab - \tab Suspicious/malfunctioning floats\cr
#' `ar_index_global_meta.txt.gz` \tab - \tab Metadata files\cr
#' `ar_index_global_prof.txt.gz` \tab `"argo"` or `"core"` \tab Argo data\cr
#' `ar_index_global_tech.txt.gz` \tab - \tab Technical files\cr
#' `ar_index_global_traj.txt.gz` \tab `"traj"` \tab Trajectory files\cr
#' `argo_bio-profile_index.txt.gz` \tab `"bgc"` or `"bgcargo"` \tab Biogeochemical Argo data (without S or T)\cr
#' `argo_bio-traj_index.txt.gz` \tab `"bio-traj"` \tab Bio-trajectory files\cr
#' `argo_synthetic-profile_index.txt.gz` \tab `"synthetic"` \tab Synthetic data, successor to `"merge"`\cr
#' }
#'
#' **Using a previously downloaded index**
#'
#' In some situations, it can be desirable to work with local
#' index file that has been copied directly from a remote server.
#' This can be useful if there is a desire to work with the files
#' in R separately from the `argoFloats` package, or with python, etc.
#' It can also be useful for group work, in which it is important for
#' all participants to use the same source file.
#'
#' This need can be handled with [getIndex()], by specifying `filename`
#' as the full path name to the previously downloaded file, and
#' at the same time specifying `server` as NULL. This works for
#' both the raw files as downloaded from the server (which end
#' in `.gz`, and for the R-data-archive files produced by [getIndex()],
#' which end in `.rda`. Since the `.rda` files load an order
#' of magnitude faster than the `.gz` files, this is usually
#' the preferred approach. However, if the `.gz` files are preferred,
#' perhaps because part of a software chain uses python code that
#' works with such files, then it should be noted that calling
#' `getIndex()` with `keep=TRUE` will save the `.gz` file in
#' the `destdir` directory.
#'
#' @param filename character value that indicates the file name to be downloaded
#' from a remote server, or (if `server` is set to NULL) the name of a local
#' file. For the remote case, the value of `server` must be taken from
#' the first column of the table given in \dQuote{Details}, or (for some file types)
#' as in the nickname given in the middle column. Note that the downloaded
#' file name will be based on the full file name given as this argument, and
#' that nicknames are expanded to the full filenames before saving. Note that
#' the downloaded file is in gzipped format (indicated by a file name ending
#' in `.gz`) and it is examined and processed by [getIndex()] to produce an
#' R archive file (ending in `.rda`) that is stored locally. The `.gz` file
#' is discarded by default, unless `keep` is set to TRUE. (See also
#' the documentation on the `server` parameter, next, and the subsection
#' entitled \dQuote{Using a previously-downloaded index}.)
#'
#' @param server an indication of the source for `filename`. There are 2
#' possibilities for this. (1) If `server` is `NULL`, then `filename` is taken
#' to be the name of a local index file (ending in suffix `.gz`) that
#' was previously downloaded from a server. The easiest way to get
#' such a file is with a previous call to [getIndex()] with `keep` set
#' to TRUE. (2) If `server` is a character vector (as is it is by default),
#' it is taken to represent remote servers to be tried as sources
#' for an index file. The use of multiple servers is a way to avoid
#' errors that can result if a server refuses a download request.
#' As of March 2023, the three servers known to work are
#' `"https://data-argo.ifremer.fr"`, `"ftp://ftp.ifremer.fr/ifremer/argo"` and
#' `"ftp://usgodae.org/pub/outgoing/argo"`.
#' These may be referred
#' to with nicknames `"ifremer-https"`, `"ifremer"`and `"usgodae"`.
#' Any URL that can be used in [curl::curl_download()] is a valid value provided
#' that the file structure is identical to the mirrors listed above. See
#' [argoDefaultServer()] for how to provide a default value for `server`.
#'
#' @template destdir
#'
#' @param age numeric value indicating how old a downloaded file
#' must be (in days), for it to be considered out-of-date. The
#' default, [argoDefaultIndexAge()], limits downloads to once per day, as a way
#' to avoid slowing down a workflow with a download that might take
#' a minute or so. Setting `age=0` will force a new
#' download, regardless of the age of the local file, and that
#' age is changed to 0 if `keep` is `TRUE`. The value of `age`
#' is ignored if `server` is NULL (see \dQuote{Using a previously
#' downloaded Index} in \dQuote{Details}).
#'
#' @param quiet logical value indicating whether to silence some
#' progress indicators. The default is to show such indicators.
#'
#' @param keep logical value indicating whether to retain the
#' raw index file as downloaded from the server. This is `FALSE`
#' by default, indicating that the raw index file is to be
#' discarded once it has been analyzed and used to create a cached
#' file (which is an RDA file). Note that if `keep`
#' is `TRUE`, then the supplied value of `age` is converted
#' to 0, to force a new download.
#'
#' @template debug
#'
#' @return An object of class [`argoFloats-class`] with type=`"index"`, which
#' is suitable as the first argument of [getProfiles()].
#'
## NOTE: this will take longer than 5s, so I had marked it dontrun on an initial
## CRAN submission. THe CRAN review suggested marking it donttest instead, but
## when I do that, rhub::check_for_cran() gives an error on the getIndex, relating
## to the non-existence of ~/data/argo. But, really, what's the point of these
## examples? The docs are really quite straightforward, and we have a paper,
## besides. I don't see that these examples illustrate anything that the docs
## don't already illustrate, and so I have removed them. --DK
## @examples
##\dontrun{
## # This is not run, because it takes longer than 5s to complete.
## # Download an index of synthetic Argo/BGC-Argo floats, and plot temporal coverage.
## library(argoFloats)
## i <- getIndex("synthetic")
## summary(i)
## hist(i[["date"]], breaks="years", main="", xlab="Time", freq=TRUE)
##}
#'
#' @references
#' Kelley, D. E., Harbin, J., & Richards, C. (2021). argoFloats: An R package for analyzing
#' Argo data. Frontiers in Marine Science, (8), 636922.
#' \doi{10.3389/fmars.2021.635922}
#'
#' @author Dan Kelley and Jaimie Harbin
#'
#' @importFrom utils read.csv tail
## @importFrom lubridate fast_strptime
## @importFrom curl curl_download
## @importFrom oce processingLogAppend
#' @export
getIndex <- function(filename="core",
server=argoDefaultServer(),
destdir=argoDefaultDestdir(),
age=argoDefaultIndexAge(),
quiet=FALSE,
keep=FALSE,
debug=0L)
{
argoFloatsDebug(debug, "getIndex(filename=\"", filename,
"\", server=",
if (is.null(server)) {
"NULL"
} else {
paste0("c(\"", paste(server, collapse="\", \""), "\")")
},
"\", destdir=\"", destdir, "\") { ...", sep="", "\n", style="bold", showTime=FALSE, unindent=1)
clock0 <- as.numeric(Sys.time())
if (!requireNamespace("oce", quietly=TRUE))
stop("must install.packages(\"oce\"), for getIndex() to work")
if (!requireNamespace("curl", quietly=TRUE))
stop("must install.packages(\"curl\") for getIndex() to work")
if (!is.logical(quiet))
stop("quiet must be a logical value")
if (1 != length(quiet))
stop("quiet must be a single value")
if (!is.logical(keep))
stop("keep must be a logical value")
if (1 != length(keep))
stop("keep must be a single value")
if (keep)
age <- 0
useLocalFile <- is.null(server)
argoFloatsDebug(debug, "Set useLocalFile=", useLocalFile, "\n", sep="")
# Sample file
# ftp://ftp.ifremer.fr/ifremer/argo/dac/aoml/1900710/1900710_prof.nc
# ftp://usgodae.org/pub/outgoing/argo/dac/aoml/1900710/1900710_prof.nc
istraj <- filename %in% c("traj", "bio-traj", "ar_index_global_traj.txt.gz", "argo_bio-traj_index.txt.gz")
res <- new("argoFloats", type="index", subtype=if (istraj) "trajectories" else "cycles")
serverOrig <- server
serverNicknames <- c("ifremer-https"="https://data-argo.ifremer.fr",
"ifremer"="ftp://ftp.ifremer.fr/ifremer/argo",
"usgodae"="ftp://usgodae.org/pub/outgoing/argo")
serverIsNickname <- server %in% names(serverNicknames)
server[serverIsNickname] <- serverNicknames[server[serverIsNickname]]
if (!all(grepl("^[a-z]+://", server))) {
stop("server must be NULL, \"ifremer-https\", \"usgodae\", \"ifremer\", or a vector of urls, but it is ",
if (length(server) > 1) paste0("\"", paste(server, collapse="\", \""), "\"")
else paste0("\"", server, "\""), "\n", sep="")
}
# Ensure that we can save the file
if (!file.exists(destdir))
stop("First, create a directory named '", destdir, "'")
if (!file.info(destdir)$isdir)
stop("'", destdir, "' is not a directory")
# Handle nicknames
filenameOrig <- filename
names <- c("core", "bgc", "bgcargo", "synthetic", "traj", "bio-traj",
"ar_index_global_prof.txt.gz", "argo_bio-profile_index.txt.gz",
"argo_synthetic-profile_index.txt.gz", "ar_index_global_traj.txt.gz",
"argo_bio-traj_index.txt.gz")
if (filename == "core") {
filename <- "ar_index_global_prof.txt.gz"
} else if (filename == "bgcargo" || filename == "bgc") {
filename <- "argo_bio-profile_index.txt.gz"
} else if (filename == "merge" || filename == "merged") {
stop("in getIndex() :\n Merged datasets are no longer available. Try using filename='synthetic'", call.=FALSE)
} else if (filename == "synthetic") {
filename <- "argo_synthetic-profile_index.txt.gz"
} else if (filename == "traj") {
filename <- "ar_index_global_traj.txt.gz"
} else if (filename == "bio-traj") {
filename <- "argo_bio-traj_index.txt.gz"
}
if (useLocalFile) {
if (!file.exists(filename)) {
stop("file \"", filename, "\" not found")
}
} else {
if (!(filename %in% names)) {
stop("filename=\"", filename, "\" is not valid; try one of: ",
paste(names, collapse="\", \""),
"\", or set server=NULL if you want to read a local .rda or .gz file.")
}
}
if (filename != filenameOrig)
argoFloatsDebug(debug, "Converted filename=\"", filenameOrig, "\" to \"", filename, "\".\n", sep="")
# Note: 'url' may contain more than one element
url <- paste(server, filename, sep="/")
# FIXME: the gsub() below is because filename might be a full pathname but
# I have no idea what to do on windows machines.
destfile <- paste(destdir, gsub(".*/", "", filename), sep="/")
# NOTE: we save an .rda file, not the .gz file, for speed of later operations
if (grepl("\\.txt\\.gz$", destfile)) {
destfileRda <- gsub(".txt.gz$", ".rda", destfile)
} else if (grepl("\\.txt", destfile)) {
destfileRda <- gsub(".txt$", ".rda", destfile)
} else if (grepl("\\.rda$", destfile)) {
destfileRda <- NULL # not used anyway; why resave an already-extant file?
} else {
stop("cannot construct .rda filename based on \"", destfile, "\" (please report an issue)")
}
argoFloatsDebug(debug, "Set destfileRda=\"", destfileRda, "\"\n", sep="")
# Handle already-downloaded files
res@metadata$url <- url[1] # FIXME: this does not make sense
res@metadata$header <- NULL
if (!useLocalFile && argoFloatsIsCached(filenameOrig, debug=debug-1)) {
argoFloatsDebug(debug, "getIndex() is about to check the cache\n")
whenCached <- argoFloatsWhenCached(filenameOrig, debug=debug-1)
cacheAge <- (as.numeric(Sys.time()) - as.numeric(whenCached)) / 86400.0
argoFloatsDebug(debug, "cacheAge=", cacheAge, " days, age=", age, " days\n", sep="")
if (cacheAge < age) {
argoFloatsDebug(debug, "cacheAge < age, so using an index that is cached in memory for this R session\n")
res <- argoFloatsGetFromCache(filenameOrig, debug=debug-1)
argoFloatsDebug(debug, "} # getIndex() took", as.numeric(Sys.time()) - clock0,
"seconds (returning from location A)\n", style="bold", unindent=1)
return(res)
} else {
argoFloatsDebug(debug, "cacheAge >= age, so the cached index is not being used\n")
}
}
# See if we have an .rda file that is sufficiently youthful.
if (!useLocalFile && file.exists(destfileRda)) {
destfileAge <- (as.numeric(Sys.time()) - as.numeric(file.info(destfileRda)$mtime)) / 86400 # in days
argoFloatsDebug(debug, "This destfileRda already exists, and its age is ", round(destfileAge, 3), " days.\n", sep="")
if (destfileAge < age) {
argoFloatsDebug(debug, "Using existing destfileRda, since its age is under ", age, " days.\n", sep="")
argoFloatsDebug(debug, "About to load '", destfileRda, "'...\n", sep="")
clock <- as.numeric(Sys.time())
argoFloatsIndex <- NULL # defined again in next line; this is to quieten code-diagnostics
load(destfileRda)
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
# TAG1 (update all such together)
res@metadata$server <- server[1]
res@metadata$ftpRoot <- argoFloatsIndex[["ftpRoot"]]
res@metadata$header <- argoFloatsIndex[["header"]]
res@data$index <- argoFloatsIndex[["index"]]
argoFloatsDebug(debug, "Storing this index in a memory cache, for this R session.\n")
argoFloatsStoreInCache(filenameOrig, res, debug=debug-1L)
argoFloatsDebug(debug, "} # getIndex() took", as.numeric(Sys.time()) - clock0,
"seconds (returning from location B)\n", style="bold", unindent=1)
return(res)
}
argoFloatsDebug(debug, sprintf("Must update destfileRda, since its age equals or exceeds %.2f days\n", age))
}
# We need to download data. We do that to a temporary file, because we will be saving
# an .rda file, not the data on the server.
if (useLocalFile) {
argoFloatsDebug(debug, "Interpreting filename=\"", filename, "\"\n", sep="")
clock <- as.numeric(Sys.time())
if (!file.exists(filename)) {
stop("file \"", filename, "\" not found")
} else {
if (grepl(".gz$", filename)) {
# TAG2 (update all such together)
destfileTemp <- filename
# infer some things from the first 100 lines
first <- readLines(filename, 100)
hash <- which(grepl("^#", first))
res@metadata$header <- first[hash]
lastHash <- tail(hash, 1)
res@metadata$server <- "https://data-argo.ifremer.fr"
warning(".gz index files lack server information; assuming https://data-argo.ifremer.fr")
res@metadata$ftpRoot <- gsub("^[^:]*:[ ]*(.*)$", "\\1", first[which(grepl("^# FTP", first))])
names <- strsplit(first[1 + lastHash], ",")[[1]]
index <- read.csv(filename, skip=2 + lastHash, col.names=names, stringsAsFactors=FALSE, colClasses="character")
index$date <- lubridate::fast_strptime(as.character(index$date), format="%Y%m%d%H%M%S", lt=FALSE, tz="UTC")
index$date_update <- lubridate::fast_strptime(as.character(index$date_update), format="%Y%m%d%H%M%S", lt=FALSE, tz="UTC")
res@data$index <- index
argoFloatsDebug(debug, "Storing this index in a memory cache, for this R session.\n")
argoFloatsStoreInCache(filenameOrig, res, debug=debug-1L)
argoFloatsDebug(debug, "} # getIndex() took", as.numeric(Sys.time()) - clock0,
"seconds (returning from location C)\n", style="bold", unindent=1)
return(res)
} else if (grepl(".rda$", filename)) {
argoFloatsDebug(debug, "Interpeting an .rda file ...\n")
clock <- as.numeric(Sys.time())
RDA <- load(filename)
# TAG1 (update all such together)
old <- get(RDA)
if (!is.list(old)) {
stop("\"", filename, "\" does not hold a list, so it was not created by getIndex()")
}
res@metadata$server <- old$server
res@metadata$ftpRoot <- old$ftpRoot
res@metadata$header <- old$header
res@data$index <- old$index
argoFloatsDebug(debug, "Storing this index in a memory cache, for this R session.\n")
argoFloatsStoreInCache(filenameOrig, res, debug=debug-1L)
argoFloatsDebug(debug, "} # getIndex() took", as.numeric(Sys.time()) - clock0,
"seconds (returning from location C)\n", style="bold", unindent=1)
return(res)
} else {
stop("filename must end in .gz or in .rda")
}
}
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
} else {
argoFloatsDebug(debug, "downloading a remote file ...\n")
clock <- as.numeric(Sys.time())
destfileTemp <- tempfile(pattern="argo", fileext=".gz")
argoFloatsDebug(debug, "Allocated temporary file\n '", destfileTemp, "'.\n", sep="")
failedDownloads <- 0
iurlSuccess <- 0 # set to a positive integer in the following loop, if we succeed
clock <- as.numeric(Sys.time())
argoFloatsDebug(debug, "About to try downloading an index file ...\n")
for (iurl in seq_along(url)) {
argoFloatsDebug(debug, " trying '", url[iurl], "'.\n", sep="")
if (!quiet) {
message(sprintf("Downloading '%s'", url[iurl]))
}
status <- try(curl::curl_download(url=url[iurl], destfile=destfileTemp, mode="wb"))
if (!inherits(status, "try-error")) {
if (failedDownloads > 0) {
message("Downloaded index from ", url[iurl])
}
iurlSuccess <- iurl
break # the download worked
} else if (any(grepl("application callback", status))) {
stop(status)
}
if (iurl == length(url)) {
message(" cannot download from ", server[iurl])
} else {
message(" cannot download from ", server[iurl], ", so moving to next server")
}
failedDownloads <- failedDownloads + 1
}
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
if (0 == iurlSuccess) {
stop(" could not download index from these servers:\n'", paste(url, collapse="'\n'"), "'")
}
}
clock <- as.numeric(Sys.time())
argoFloatsDebug(debug, "About to read the header at the start of the index file ...\n", sep="")
# TAG2 (update all such together)
first <- readLines(destfileTemp, 100)
# Typically, length(ftpRoot) is 2
ftpRoot <- gsub("^[^:]*:[ ]*(.*)$", "\\1", first[which(grepl("^# FTP", first))])
argoFloatsDebug(debug, " ftpRoot=c(\"", paste(ftpRoot, collapse="\", \""), "\")\n", sep="")
hash <- which(grepl("^#", first))
header <- first[hash]
lastHash <- tail(hash, 1)
names <- strsplit(first[1 + lastHash], ",")[[1]]
argoFloatsDebug(debug, " names=c(\"", paste(names, collapse="\", \""), "\")\n", sep="")
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
clock <- as.numeric(Sys.time())
argoFloatsDebug(debug, "Reading index file contents ...\n", sep="")
index <- read.csv(destfileTemp, skip=2 + lastHash, col.names=names, stringsAsFactors=FALSE, colClasses="character")
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
argoFloatsDebug(debug, "Setting out-of-range longitude and latitude to NA ...\n")
clock <- as.numeric(Sys.time())
if ("latitude" %in% names(index)) {
index$latitude <- as.numeric(index$latitude)
index$latitude[abs(index$latitude) > 90] <- NA
} else if ("latitude_max" %in% names(index) && "latitude_min" %in% names(index)) {
index$latitude_max <- as.numeric(index$latitude_max)
index$latitude_min <- as.numeric(index$latitude_min)
#message("the class of lat min is", class(index$latitude_min), "and the class of lat max is", class(index$latitude_max))
} else {
stop("Misconfigured index file: no \"latitude\" data found")
}
if ("longitude" %in% names(index)) {
index$longitude <- as.numeric(index$longitude)
index$longitude[abs(index$longitude) > 360] <- NA
} else if ("longitude_max" %in% names(index) && "longitude_min" %in% names(index)) {
index$longitude_max <- as.numeric(index$longitude_max)
index$longitude_min <- as.numeric(index$longitude_min)
} else {
stop("Misconfigured index file: no \"longitude\" data found")
}
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
clock <- as.numeric(Sys.time())
argoFloatsDebug(debug, "Decoding times ...\n", sep="")
# 2020-12-12: switch from as.POSIXct() to lubridate::fast_strptime(), shaving typically
# 1 second from elapsed times. See below for some timing tests, and
# https://github.com/ArgoCanada/argoFloats/issues/334 for discussion.
#
# Note that we are only 'Suggest'ing lubridate in DESCRIPTION (and requiring it here) because otherwise the user will
# see possibly scary messages about function overriding.
#
#: cat("as.POSIXct() timing:\n")
#: print(system.time(t1 <- as.POSIXct(as.character(index$date), format="%Y%m%d%H%M%S", tz="UTC")))
#: cat("strptime() timing:\n")
#: print(system.time(t2 <- strptime(as.character(index$date), format="%Y%m%d%H%M%S", tz="UTC")))
#: cat("ymd_hms() timing:\n")
#: print(system.time(t3 <- lubridate::ymd_hms(as.character(index$date), quiet=TRUE)))
#: cat("lubridate::fast_strptime() timing:\n")
#: print(system.time(t4 <- lubridate::fast_strptime(as.character(index$date), format="%Y%m%d%H%M%S", lt=FALSE, tz="UTC")))
#: cat("readr::parse_datetime() timing:\n")
#: print(system.time(t5 <- readr::parse_datetime(as.character(index$date), format="%Y%m%d%H%M%S")))
#: cat("tests of matches:\n")
#: cat(" as.POSIXct() same as strptime(): ", all.equal(t1, t2), "\n")
#: cat(" as.POSIXct() same as lubridate::ymd_hms(): ", all.equal(t1, t3), "\n")
#: cat(" as.POSIXct() same as lubridate::fast_strptime(): ", all.equal(t1, t4), "\n")
#: cat(" as.POSIXct() same as readr::parse_datetime(): ", all.equal(t1, t4), "\n")
#
# as.POSIXct() timing:
# user system elapsed
# 2.841 0.048 2.913
# 2.793 0.051 2.867
# 2.850 0.041 2.899
# strptime() timing:
# user system elapsed
# 2.435 0.008 2.451
# 2.426 0.002 2.429
# 2.440 0.002 2.444
# ymd_hms() timing:
# user system elapsed
# 3.383 0.101 3.496
# 3.340 0.102 3.445
# 3.399 0.095 3.499
# lubridate::fast_strptime() timing:
# user system elapsed
# 1.894 0.003 1.903
# 1.860 0.002 1.865
# 1.913 0.002 1.918
# readr::parse_datetime() timing:
# Warning: 1667 parsing failures.
# row col expected actual
# 8956 -- date like %Y%m%d%H%M%S 2.0040404e+13
# 8967 -- date like %Y%m%d%H%M%S 2.0040723e+13
# 9025 -- date like %Y%m%d%H%M%S 2.0040423e+13
# 13984 -- date like %Y%m%d%H%M%S 2.0041001e+13
# 14024 -- date like %Y%m%d%H%M%S 2.0040325e+13
# ..... ... ...................... .............
# See problems(...) for more details.
#
# user system elapsed
# 3.324 0.083 3.475
# 3.293 0.080 3.433
# 3.067 0.080 3.182
if (!requireNamespace("lubridate", quietly=TRUE)) {
stop("must install.packages(\"lubridate\") for getIndex() to work")
}
index$date <- lubridate::fast_strptime(as.character(index$date), format="%Y%m%d%H%M%S", lt=FALSE, tz="UTC")
index$date_update <- lubridate::fast_strptime(as.character(index$date_update), format="%Y%m%d%H%M%S", lt=FALSE, tz="UTC")
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
# If we are working with a local .gz file that was previously downloaded, how do we know the right
# values to save for 'server' and 'ftpRoot'? (Are these used later?)
if (useLocalFile) {
argoFloatsIndex <- list(server="unknown", ftpRoot=ftpRoot, header=header, index=index)
} else {
argoFloatsIndex <- list(server=server[iurlSuccess], ftpRoot=ftpRoot, header=header, index=index)
}
clock <- as.numeric(Sys.time())
argoFloatsDebug(debug, "Saving the RDA file as \"", destfileRda, "\" ...\n")
save(argoFloatsIndex, file=destfileRda)
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
if (keep) {
to <- paste0(destdir, "/", gsub(".*/", "", url[iurlSuccess]))
argoFloatsDebug(debug, "Storing temporary raw index file\n '", destfileTemp, "'\n locally as\n '", to, "'.\n", sep="")
file.copy(from=destfileTemp, to=to)
message("Saved the original index file as \"", to, "\"")
}
argoFloatsDebug(debug, "Cleaning up ...\n")
clock <- as.numeric(Sys.time())
if (useLocalFile) {
# FIXME: what to save for server and url?
res@metadata$server <- server[1]
res@metadata$url <- url[1]
} else {
# Guess on server and URL for a downloaded file
argoFloatsDebug(debug, "removing \"", destfileTemp, "\"\n", sep="")
unlink(destfileTemp)
res@metadata$server <- server[iurlSuccess]
res@metadata$url <- url[iurlSuccess]
}
res@metadata$ftpRoot <- argoFloatsIndex$ftpRoot
res@metadata$header <- argoFloatsIndex$header
res@data$index <- argoFloatsIndex$index
res@processingLog <- oce::processingLogAppend(res@processingLog,
paste("getIndex(server=",
if (length(serverOrig) == 1) paste("\"", serverOrig, "\", ", sep="")
else paste("c(\"", paste(serverOrig, collapse="\", \""), "\"), ", sep=""),
"filename=\'", filename, "\", age=", age, ")", sep=""))
argoFloatsDebug(debug, "... took", as.numeric(Sys.time()) - clock, "seconds\n")
argoFloatsDebug(debug, "Storing newly-read index in memory for this R session ...\n")
argoFloatsStoreInCache(filenameOrig, res, debug=debug-1)
argoFloatsDebug(debug, "} getIndex() took ", as.numeric(Sys.time()) - clock0,
"seconds (returning from location D)\n", style="bold", unindent=1)
res
}
#' Get "cycles"/"trajectory" Files Named in an argoFloats Index
#'
#' This takes an index constructed with [getIndex()], possibly
#' after focusing with [subset,argoFloats-method()], and creates
#' a list of files to download from the server named in the index.
#' Then these files are downloaded to the `destdir` directory,
#' using filenames inferred from the source filenames. The
#' value returned by [getProfiles()] is suitable for use
#' by [readProfiles()].
#'
#' It should be noted that the constructed server URL follows
#' a different pattern on the USGODAE an Ifremer servers, and
#' so if some other server is used, the URL may be wrong, leading
#' to an error. Similarly, if the patterns on these two
#' servers change, then [getProfiles()] will fail. Users who
#' encounter such problems are requested to report them
#' to the authors.
#'
#' If a particular data file cannot be downloaded after multiple trials, then
#' the behaviour depends on the value of the `skip` argument. If that is
#' `TRUE` then a `NA` value is inserted in the corresponding
#' spot in the return value, but if it is `FALSE` (the default), then an error is reported.
#' Note that [readProfiles()] skips over any such `NA` entries,
#' while reporting their positions within `index`.
#' For more on this function, see Kelley et al. (2021).
#'
#' @param index an [`argoFloats-class`] object of type `"index"`, as created
#' by [getIndex()].
#'
#' @template destdir
#'
#' @param age Option 1) a numerical value indicating a time interval, in days.
#' If the file to be downloaded from the server already exists locally,
#' and was created is less than age days in the past, it will not be downloaded.
#' The default is one year. Setting age=0 will force a download.
#' Option 2) "latest" meaning the file will only be downloaded if
#' A) the file doesn't exist or B) the file does exist and the time
#' it was created is older than the date_update in the index file
#'
#' @template retries
#'
#' @template skip
#'
#' @template quiet
#'
#' @template debug
#'
#' @return An object of class [`argoFloats-class`] with type=`"profiles"`, the
#' `data` slot of which contains two items: `url`,
#' which holds the URLs from which the NetCDF
#' files were downloaded, and `file`, which
#' holds the path names of the downloaded files; the latter
#' is used by [readProfiles()].
#'
## NOTE: this will take longer than 5s, so I had marked it dontrun on an initial
## CRAN submission. THe CRAN review suggested marking it donttest instead, but
## when I do that, rhub::check_for_cran() gives an error on the getIndex, relating
## to the non-existence of ~/data/argo. But, really, what's the point of these
## examples? The docs are really quite straightforward, and we have a paper,
## besides. I don't see that these examples illustrate anything that the docs
## don't already illustrate, and so I have removed them. --DK
## @examples
##\dontrun{
## # This is not run, because it takes longer than 5s to complete.
## library(argoFloats)
## data(index)
## index2 <- subset(index, 1:2)
## profiles2 <- getProfiles(index2)
## # See ?readProfiles for how to read the files now downloaded.
##}
#'
#' @references
#' Kelley, D. E., Harbin, J., & Richards, C. (2021). argoFloats: An R package for analyzing
#' Argo data. Frontiers in Marine Science, (8), 636922.
#' \doi{10.3389/fmars.2021.635922}
#'
#' @author Dan Kelley
#'
## @importFrom oce processingLogAppend
#'
#' @export
getProfiles <- function(index, destdir=argoDefaultDestdir(), age=argoDefaultProfileAge(), retries=3, skip=TRUE, quiet=TRUE, debug=0)
{
if (!inherits(index, "argoFloats") || index[["type"]] != "index") {
stop("'index' must be an object created with getIndex() or subset()")
}
debug <- max(0, min(3, floor(debug+0.5)))
if (!requireNamespace("oce", quietly=TRUE)) {
stop("must install.packages(\"oce\") for getProfiles() to work")
}
n <- length(index@data$index)
argoFloatsDebug(debug, "getProfiles(..., age=", age, ", ...) {\n", style="bold", showTime=FALSE, unindent=1, sep="")
if (missing(index)) {
stop("In getProfiles() : must provide an index, as created by getIndex()", call.=FALSE)
}
if (!inherits(index, "argoFloats")) {
stop("'index' must be an object created by getIndex()")
}
res <- new("argoFloats", type="profiles")
n <- length(index[["file"]])
if (n == 0) {
warning("In getProfiles() : the index has no files, so there is nothing to 'get'\n", call.=FALSE)
file <- character(0)
} else {
file <- rep("", n)
argoFloatsDebug(debug, "index[[\"ftpRoot\"]] is c(\"",
paste(index[["ftpRoot"]], collapse="\", \""), "\")\n", sep="")
argoFloatsDebug(debug, "index[[\"file\"]] is \"", index[["file"]], "\"\n", sep="")
# The Construction of the remote filename is tricky was changed on 2020-04-30. Previously,
# we used "ftpRoot", inferred from the "# FTP" line in the header in the index file.
# However, as discussed at https://github.com/ArgoCanada/argoFloats/issues/82,
# this decision was faulty, in the sense that it sometimes worked and sometimes
# failed, for the *same* index object -- that is, the file structure on the server
# must be changeable. So, as a test (which will need to be reevaluated over time),
# we switched to using "server" instead of "ftpRoot". That means that we seek
# the NetCDF files from the same source as the index, and *not* from the source listed
# in the "# FTP" line in the header within that index.
server <- index[["server"]]
# I *thought* the USGODAE and IFREMER servers were once set up differently, with only usgodae having "dac" in the path
# name. That is why the next block was written. However, as of May 15, 2020, it seems they are set up in the same
# way, so the Ifremer case was rewritten to match the usgodae case.
urls <- paste0(server, "/dac/", index[["file"]])
argoFloatsDebug(debug, "First entry of urls: \"", urls[1], "\"\n", sep="")
f <- list.files(destdir) # files in directory
# Find files that should *not* be downloaded, because they are of recent age.
useLatest <- age == "latest" && length(f) > 0L
if (useLatest) {
fileNames <- gsub("^.*[/\\\\]([A-Z]*[0-9]*_[0-9]{3,4}[D]{0,1}\\.nc)$", "\\1", index@data$index$file, perl=TRUE) # files of index
skipDownload <- rep(FALSE, length(fileNames))
# download1 indicates files not on local system (see also download2 below)
download1 <- which(!(fileNames %in% f))
# Now keep any that do exist, but are out of date
keep <- which(fileNames %in% f)
path <- paste0(destdir, "/", fileNames[keep])
info <- lapply(path, file.info)
time <- do.call(c, lapply(info, function(x) x$ctime))
# Make times on computer be UTC
timeUTC <- lubridate::with_tz(time, "UTC")
# Determine if computer time is earlier than date_update
dateUpdate <- index[["date_update"]][keep]
# download2 indicates old-age files
download2 <- which(timeUTC < dateUpdate)
argoFloatsDebug(debug, length(download1), "files must be downloaded.\n")
SKIP1 <- which(!(seq_along(fileNames) %in% download1))
SKIP2 <- which(!(seq_along(fileNames[keep]) %in% download2))
skipDownload[SKIP1] <- TRUE
skipDownload[SKIP2] <- TRUE
argoFloatsDebug(length(skipDownload), "files are too young to be downloaded.\n")
}
file <- downloadWithRetries(
urls,
destdir=destdir,
destfile=basename(urls),
quiet=quiet,
age=if (useLatest) ifelse(skipDownload, 100*365, 0.0) else age,
async=TRUE,
debug=debug-1L)
}
res@metadata$destdir <- destdir
res@data$url <- urls
res@data$file <- file
res@processingLog <- oce::processingLogAppend(res@processingLog, "getProfiles(index, ...)")
argoFloatsDebug(debug, "} # getProfiles()\n", style="bold", showTime=FALSE, unindent=1)
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.