# makeAggregatedDataset.R Create a virtual dataset (ncml)
#
# Copyright (C) 2016 Santander Meteorology Group (http://www.meteo.unican.es)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#' @description Creates virtual datasets by modifying and combining other datasets via NcML.
#' @title Create a dataset from a collection of (netCDF) files
#' @importFrom rJava J
#' @details The NetCDF Markup Language (NcML) is an XML dialect that allows creating
#' CDM datasets (i.e.: any collection of scientific data which can be accessed
#' through the NetCDF-Java / CDM library). The NcML document refers to
#' another dataset called the referenced CDM dataset, generally composed on a
#' number of netCDF files (but also grib, hdf or many other binary file formats)
#' containing the geo-referenced data. This function creates a NcML file from
#' multiple CDM files that are conveniently combined ("aggregated") along the
#' selected dimension.
#' The use of NcML is not only intended for CDM file combination, and this function considers just a particular case.
#' NcML is a powerful, yet relatively simple way of dealing with large, complex datasets in a straightforward manner.
#' Among other capabilitities, NcML files can be generated in order to add/delete
#' metadata and variables to be renamed, added, deleted and restructured.
#' This function is intended for a simple operation of aggregation of collections of netCDF files,
#' as it is the most common case of gridded climate datasets.
#' @param source.dir Parent directory containing the files to be aggregated
#' @param ncml.file Full path of the output NcML file
#' @param file.ext Character string indicating the extension of the CDM datasets
#' to be aggregated. Default to \code{nc} (netCDF).
#' @param aggr.dim Character string indicating the dimension along which the
#' files will be concatenated. Default to \code{"time"}.
#' @param pattern An optional regular expression. Only file names which match the
#' regular expression will be considered in the aggregation
#' (see \link[base]{regexp}). This argument can be useful in order to save time
#' when only a particular subset of variables from the whole collection is
#' needed. Default to \code{NULL}, meaning that all files in the search path
#' are included (See next argument). See Notes.
#' @param recursive Logical. Should the listing of files to be aggregated recurse
#' into directories?. Default to \code{FALSE}. This is useful for instance when
#' each variable is stored in a sepparate subdirectory.
#' @param verbose Logical. Should additional information of the NcML file creation
#' steps be printed on screen?. Default to \code{TRUE}.
#' @param timeUnitsChange Logical. Set the timeUnitsChange option,
#' useful when aggregating many files across the time dimension which have relative time axis.
#' Default to \code{TRUE}.
#' @return Creates a NcML file at the specified location.
#' @export
#' @note The current implementation of the function only considers datasets in which each file stores
#' one single variable. For other dataset configurations, please refer to the NcML tutorial.
#'
#' \emph{Wildcards} (or \emph{globbing}) patterns as used by most shells can be mapped to regular expression in R
#' using \code{\link[utils]{glob2rx}}.
#'
#' @references NcML Tutorial \url{http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/ncml/Tutorial.html}. (Last accessed 14 Mar 2016.).
#' @author J. Bedia, with contributions of E. Tourigny and A. CofiƱo
#' @family loading
makeAggregatedDataset <- function(source.dir, ncml.file, file.ext = "nc", aggr.dim = "time",
pattern = NULL, recursive = FALSE, verbose = TRUE, timeUnitsChange = TRUE) {
file.ext <- match.arg(file.ext, c("nc", "nc4", "hdf", "grib","gini"))
suffix <- paste("\\.", file.ext, "$", sep = "")
if (is.null(pattern)) {
pattern <- suffix
}
lf <- normalizePath(list.files(source.dir, full.names = TRUE, pattern = pattern, recursive = recursive), winslash = "/")
lf <- grep(suffix, lf, value = TRUE)
message("[",Sys.time(),"] Creating dataset from ", length(lf), " files")
z <- file(ncml.file, "w")
# http://www.unidata.ucar.edu/software/netcdf/ncml/v2.2/Aggregation.html
cat("<?xml version=\"1.0\" encoding=\"UTF-8\"?>", file = z)
cat(c("\n","<netcdf xmlns=\"http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2\">"), sep = "", file = z)
cat(c("\n","\t","<aggregation type=\"union\">"), sep = "", file = z)
varNames <- rep(NA, length(lf))
for (i in 1:length(lf)) {
gds <- J("ucar.nc2.dt.grid.GridDataset")$open(.jnew("java/lang/String", lf[i]))
varNames[i] <- unlist(strsplit(gsub("\\[|]|\\s", "", gds$getGrids()$toString()), ","))[1] # case when you have something like "varName, lon, lat" (e.g. some ENSEMBLES files)
}
vars <- unique(varNames)
#
# Next code chunk takes one unique file for each variable and checks
# if the aggregation dimension exists for that variable.
# This makes sense when aggregation e.g. by "level", and we want to include
# surface variables too in the dataset
#
hasAggrDim <- rep(TRUE, length(vars))
ind <- match(vars, varNames)
for (i in 1:length(vars)) {
if (verbose) {
message("[",Sys.time(),"] Scanning file ", i, " out of ", length(vars))
}
gds <- J("ucar.nc2.dt.grid.GridDataset")$open(lf[ind[i]])
grid <- gds$findGridByShortName(vars[i])
dims <- unlist(strsplit(gsub("\\[|]|;|\\s","", grid$getDimensions()$toString()), split = ","))
hasAggrDim[i] <- any(grepl(aggr.dim, dims))
gds$close()
}
#
# Those variables without the aggregating dimension are directly aggregated
# with "union" at the upper level
#
ind <- which(!hasAggrDim)
if (length(ind) > 0) {
for (i in 1:length(ind)) {
varfile <- grep(vars[ind[i]], lf, value = TRUE)
for (j in 1:length(varfile)) {
cat(c("\n","\t","<netcdf location=", "\"", varfile[j]), "\"/>", sep = "", file = z)
}
}
}
#
# Case when there is a single file per variable. No need to concatenate
# different files of the same variable. The variables are directly aggregated
# with "union" at the upper level
#
if (length(lf) == length(vars)) {
for (i in 1:length(lf)) {
gds <- J("ucar.nc2.dt.grid.GridDataset")$open(lf[i])
grid <- gds$findGridByShortName(vars[i])
dims <- unlist(strsplit(gsub("\\[|]|;|\\s","", grid$getDimensions()$toString()), split = ","))
aggrDimIndex <- as.integer(grep(aggr.dim, dims) - 1)
ncoords <- grid$getDimension(aggrDimIndex)$getLength()
cat(c("\n", "\t", "<netcdf location=", "\"", lf[i], "\" ncoords=\"", ncoords), "\"/>", sep = "", file = z)
}
} else {
#
# Many files per variable are concatenated by "joinExisting" at the next
# aggregation level
#
ind <- which(hasAggrDim)
if (length(ind) < 1) {
stop("Invalid dimension for aggregation")
}
if (verbose) {
message("[",Sys.time(),"] Defining aggregating dimension length\nThis process may be slow but will significantly speed-up data retrieval...")
}
if ( timeUnitsChange ) { strTimeUnitsChange="timeUnitsChange=\"true\"" } else { strTimeUnitsChange="" }
for (i in 1:length(ind)) {
varfile <- grep(vars[ind[i]], lf, value = TRUE)
cat(c("\n","\t","\t","<netcdf>"), sep = "", file = z)
cat(c("\n","\t","\t","<aggregation dimName=\"", aggr.dim, "\" ", "type=\"joinExisting\" ",strTimeUnitsChange,">"), sep = "", file = z)
cat(c("\n","\t","\t","<variableAgg name=\"", vars[ind[i]], "\"/>"), sep = "", file = z)
for (j in 1:length(varfile)) {
gds <- J("ucar.nc2.dt.grid.GridDataset")$open(varfile[j])
grid <- gds$findGridByShortName(vars[ind[i]])
dims <- unlist(strsplit(gsub("\\[|]|;|\\s","", grid$getDimensions()$toString()), split = ","))
aggrDimIndex <- as.integer(grep(aggr.dim, dims) - 1)
ncoords <- grid$getDimension(aggrDimIndex)$getLength()
gds$close()
cat(c("\n","\t","\t","\t","<netcdf location=", "\"", varfile[j]), "\" ncoords=\"", ncoords, "\"/>", sep = "", file = z)
}
cat(c("\n","\t","\t","</aggregation>"), sep = "", file = z)
cat(c("\n","\t","\t","</netcdf>"), sep = "", file = z)
}
if (verbose) {
message("[",Sys.time(),"] Dimension length defined")
}
}
cat(c("\n","\t","</aggregation>"), sep = "", file = z)
cat(c("\n","</netcdf>"), sep = "", file = z)
close(z)
message("[",Sys.time(),"] NcML file \"", ncml.file, "\" created from ", length(lf), " files corresponding to ", length(vars), " variables")
message("Use 'dataInventory' to obtain a description of the dataset")
}
# End
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.