# Internal data-loading functions for RCMIP5
#' Load a unique CMIP5 ensemble
#'
#' Loads the data for a particular CMIP5 experiment-variable-model-ensemble
#' combination (one or more files). Returns NULL and a warning if nothing matches.
#'
#' @param variable CMIP5 variable to load (required)
#' @param model CMIP5 model to load (required)
#' @param experiment CMIP5 experiment to load (required)
#' @param ensemble CMIP5 ensemble to load (required)
#' @param domain CMIP5 domain to load (required)
#' @param path optional root of directory tree
#' @param recursive logical. Recurse into directories?
#' @param verbose logical. Print info as we go?
#' @param force.ncdf deprecated. Formerly forced use of the less-desirable ncdf
#' package for testing; now ignored with a warning.
#' @param yearRange numeric of length 2. If supplied, load only years of data
#' inclusively between these years.
#' @param ZRange numeric of length 2. If supplied, load only Z data within this range.
#' @return A \code{\link{cmip5data}} object, or \code{NULL} if nothing loaded
#' @details This function is the core of RCMIP5's data-loading. It loads all files matching
#' the experiment, variable, model, ensemble, and domain supplied by the caller.
#' @note This function is not intended to be called directly by the user; it will return
#' the \code{val} component as a multidimensional array, not a data frame.
#' @note This is an internal RCMIP5 function and not exported.
#' @keywords internal
loadEnsemble <- function(variable, model, experiment, ensemble, domain,
                         path='.', recursive=TRUE, verbose=FALSE, force.ncdf=FALSE,
                         yearRange=NULL, ZRange=NULL) {

    # Sanity checks - make sure all parameters are correct class and length
    assert_that(length(variable)==1 & is.character(variable))
    assert_that(length(model)==1 & is.character(model))
    assert_that(length(experiment)==1 & is.character(experiment))
    assert_that(length(ensemble)==1 & is.character(ensemble))
    assert_that(length(domain)==1 & is.character(domain))
    assert_that(is.dir(path))
    assert_that(is.readable(path))
    assert_that(is.flag(recursive))
    assert_that(is.flag(verbose))
    assert_that(is.flag(force.ncdf))
    # '&' binds tighter than '|', so these read as
    # is.null(x) | (length(x)==2 & is.numeric(x)) -- the intended check
    assert_that(is.null(yearRange) | length(yearRange)==2 & is.numeric(yearRange))
    assert_that(is.null(ZRange) | length(ZRange)==2 & is.numeric(ZRange))

    # We formerly preferred the 'ncdf4' package but could fall back to 'ncdf';
    # ncdf4 is now required and the force.ncdf flag is a no-op
    if(force.ncdf) {
        warning('force.ncdf is ignored now. ncdf4 is a required package')
    }
    # Fail fast with a clear message if ncdf4 is unavailable, rather than with
    # an obscure error at the first ncdf4:: call below
    if(!requireNamespace('ncdf4', quietly=!verbose)) {
        stop('The ncdf4 package is required but could not be loaded')
    }
    .nc_open <- ncdf4::nc_open
    .ncatt_get <- ncdf4::ncatt_get
    .ncvar_get <- ncdf4::ncvar_get
    .nc_close <- ncdf4::nc_close

    # List all files that match specifications
    fileList <- list.files(path=path, full.names=TRUE, recursive=recursive)

    # Match file names with valid CMIP5 patterns:
    # ...variable_domain_model_experiment_ensemble followed by either
    # ...a '_' or a '.' depending on whether a time period is specified or not.
    # ...Note that the '[_\\.]' match differentiates between
    # ...ensembles like 'r1i1p1' and 'r1i1p11', an unlikely but possible case.
    fileList <- fileList[grepl(pattern=sprintf('^%s_%s_%s_%s_%s[_\\.].*nc$',
                                               variable, domain, model,
                                               experiment, ensemble),
                               basename(fileList))]

    if(length(fileList)==0) {
        warning(paste("Could not find any matching files for",
                      variable, domain, model, experiment, ensemble))
        return(NULL)
    }

    # Get the domains of all files we want to load. Recall CMIP5 naming
    # ...conventions: variable_domain_model_experiment_ensemble_time.nc
    domainCheck <- unname(vapply(unlist(fileList),
                                 function(x) { unlist(strsplit(basename(x), '_'))[2] },
                                 FUN.VALUE=''))

    # Check that we are only loading one domain. We check this before checking
    # other CMIP5 specifications because 'fx' domains will split on '_' to a
    # different number of strings than temporal domains.
    if(length(unique(domainCheck)) > 1) {
        stop('Domain is not unique: [', paste(unique(domainCheck), collapse=' '), ']\n')
    }

    # Get the number of pieces of CMIP5 information strings
    numSplits <- length(unlist(strsplit(basename(fileList[1]), '_')))

    # Split all file names based on '_' or '.'
    cmipName <- unname(vapply(unlist(fileList),
                              function(x) { unlist(strsplit(basename(x), '[_\\.]')) },
                              FUN.VALUE=rep('', length.out=numSplits+1)))

    # List what order the fields appear in the name, for CMIP5 this will be:
    # ...variable_domain_model_experiment_ensemble_time.nc Note that we
    # ...aren't interested in checking the time string
    checkField <- list(variable=1, domain=2, model=3, experiment=4, ensemble=5)

    # Go through and make sure that we have unique variable, domain, model,
    # ...experiment, and ensemble strings for the set of files we are trying
    # ...to load. We want to avoid trying to load a file from ModelA and ModelB
    # ...as one cmip5data structure here. Also set the appropriate variables
    # ...so we can identify the cmip5data object correctly.
    for(checkStr in names(checkField)) {
        # Pull all unique strings
        tempStr <- unique(cmipName[checkField[[checkStr]],])
        if(length(tempStr) > 1) { # there should only be one!
            stop('[',checkStr, '] is not unique: [', paste(tempStr, collapse=' '), ']\n')
        } else {
            # Set a variable named by the field (e.g. 'variable', 'model') to
            # the string extracted from the file name, so the cmip5data object
            # below is labeled with exactly what was loaded.
            # (assign() replaces a fragile eval(parse(text=...)) construction.)
            assign(checkStr, tempStr[1])
        }
    }

    # Go through and load the data
    val <- c() # variable to temporarily hold main data
    timeRaw <- c()
    timeArr <- c()
    ZUnit <- NULL
    valUnit <- NULL
    loadedFiles <- c()

    # Note that list.files returns a sorted list so these files should already
    # be in temporal order if the ensemble is split over multiple files.
    for(fileStr in fileList) {
        if(verbose) cat('Loading', fileStr, "\n")
        nc <- .nc_open(fileStr, write=FALSE)

        # Get variable names available
        varNames <- unlist(lapply(nc$var, FUN=function(x) { x$name }))

        # Get dimension names for 'variable'
        dimNames <- unlist(lapply(nc$var[[variable]]$dim, FUN=function(x) { x$name }))
        if(verbose) cat("-", variable, "dimension names:", dimNames, "\n")
        assert_that(length(dimNames) %in% c(1, 2, 3, 4)) # that's all we know

        # Most, but not all, files have longitude and latitude. Load if available.
        lonArr <- NULL
        lonUnit <- NULL
        latArr <- NULL
        latUnit <- NULL
        if(length(dimNames) > 1) {
            # If 'lon' and 'lat' are available, use those. Otherwise load
            # whatever is specified by the main variable's dimensionality
            # TODO: this is a temporary (?) hack re issue #96
            if('lon' %in% varNames) dimNames[1] <- 'lon'
            if('lat' %in% varNames) dimNames[2] <- 'lat'
            lonArr <- .ncvar_get(nc, varid=dimNames[1])
            lonUnit <- .ncatt_get(nc, dimNames[1], 'units')$value
            lonLen <- ifelse(length(dim(lonArr)) > 1, dim(lonArr)[1], length(lonArr))
            latArr <- .ncvar_get(nc, varid=dimNames[2])
            latUnit <- .ncatt_get(nc, dimNames[2], 'units')$value
            latLen <- ifelse(length(dim(latArr)) > 1, dim(latArr)[1], length(latArr))
            # Some models provide 1-D arrays, some 2-D. Convert all to the latter
            if(length(dim(lonArr)) < 2) {
                lonArr <- array(lonArr, dim=c(lonLen, latLen))
            }
            if(length(dim(latArr)) < 2) {
                latArr <- array(rep(latArr, 1, each=lonLen), dim=c(lonLen, latLen))
            }
        }

        # Get the time frequency. Note that this should be related to
        # ...the domain (ie 'mon' should be the frequency of the domain 'Amon').
        # ...In theory we could extract this from the domain
        timeFreqStr <- .ncatt_get(nc, varid=0, "frequency")$value

        # Non-fixed files have a time dimension to deal with:
        if(! timeFreqStr %in% 'fx') {
            # Get the time unit (e.g. 'days since 1860')
            timeName <- dimNames[length(dimNames)]
            timeUnit <- .ncatt_get(nc, timeName, 'units')$value
            # Get the type of calendar used (e.g. 'noleap')
            calendarStr <- .ncatt_get(nc, timeName, 'calendar')$value
            calendarUnitsStr <- .ncatt_get(nc, timeName, 'units')$value

            # Extract the number of days in a year (e.g. '360_day' calendars)
            if(grepl('^[^\\d]*\\d{3}[^\\d]day', calendarStr)) {
                calendarDayLength <- as.numeric(regmatches(calendarStr,
                                                           regexpr('\\d{3}', calendarStr)))
            } else {
                calendarDayLength <- 365
            }

            # Extract the year we are referencing in calendar
            # Set the default to year 1, month 1, day 1, hour 0, min 0, sec 0
            defaultCalendarArr <- c(1, 1, 1, 0, 0, 0)
            # Split the calendar unit string based on a '-', space, or ':'
            # ...this allows us to deal with YYYY-MM-DD hh:mm:ss, YYYY-MM-DD, or
            # ...YYYY-M-D
            calendarArr <- unlist(strsplit(calendarUnitsStr, split='[- :]'))
            # Check that the time is going to be in days otherwise later
            # ...calculations for time array break
            assert_that(any(grepl('day', calendarArr)))
            # extract just the digits
            calendarArr <- as.numeric(calendarArr[grepl('^\\d+$', calendarArr)])
            # swap the default values with the extracted values, we assume
            # ...that years are listed before months, before days, and so on
            temp <- defaultCalendarArr
            temp[1:length(calendarArr)] <- calendarArr
            calendarArr <- temp
            # calculate the decimal starting year
            startYr <- sum((calendarArr - c(0, 1, 1, 0, 0, 0))
                           / c(1, 12, calendarDayLength, calendarDayLength*24,
                               calendarDayLength*24*60, calendarDayLength*24*60*60))

            # Load the actual time
            thisTimeRaw <- .ncvar_get(nc, varid=timeName)
            attributes(thisTimeRaw) <- NULL
            assert_that(!any(duplicated(thisTimeRaw)))
            # convert from days (we assume the units are days) to years
            thisTimeArr <- thisTimeRaw / calendarDayLength + startYr
        } else { # this is a fx variable. Set most things to NULL
            startYr <- NULL
            timeArr <- NULL
            timeUnit <- NULL
            calendarStr <- NULL
            calendarDayLength <- NULL
            calendarUnitsStr <- NULL
            # NOTE(review): on the first iteration val is NULL, so this dim
            # assignment is a no-op; for subsequent fx files it drops any
            # trailing dimensions -- confirm this is the intended behavior
            dim(val) <- dim(val)[1:2]
            thisTimeRaw <- NULL
            thisTimeArr <- NULL
        }

        # Load the 4th dimension, if present:
        ZArr <- NULL
        if(length(dimNames) == 4) {
            ZArr <- .ncvar_get(nc, varid=dimNames[3])
            attributes(ZArr) <- NULL
            ZUnit <- .ncatt_get(nc, dimNames[3], 'units')$value
            assert_that(!any(duplicated(ZArr)))
        }

        # Construct the 'start' and 'count' arrays for ncvar_get below
        # (See ncvar_get documentation for what these mean.)
        ndims <- nc$var[[variable]]$ndims
        start <- rep(1, ndims)
        count <- rep(-1, ndims)

        # If ZRange supplied, calculate filter for the data load below
        # Note this is above the yearRange check so ZArr data don't get erased
        # by loading subsequent files with no data
        if(!is.null(ZRange) && !is.null(ZArr)) {
            Zinrange <- min(ZRange) <= ZArr & max(ZRange) >= ZArr
            if(any(Zinrange)) {
                zstart <- min(which(Zinrange))
                zend <- max(which(Zinrange))
                # Modify the 'start' and 'count' arrays for ncvar_get below
                # (Z is the next-to-last dimension; time is always last)
                start[ndims-1] <- zstart
                count[ndims-1] <- zend - zstart + 1
                ZArr <- ZArr[Zinrange]
                if(verbose) cat("- loading only Z values", zstart, "-", zend, "\n")
            } else {
                if(verbose) cat("- skipping file because not in ZRange\n")
                .nc_close(nc)
                next
            }
        }

        # If yearRange supplied, calculate filter for the data load below
        if(!is.null(yearRange) && !is.null(thisTimeArr)) {
            # User has requested to load a temporal subset of the data.
            # First question: does this file overlap at all?
            yearRange <- floor(yearRange) # strip off any decimals
            if(min(yearRange) > max(floor(thisTimeArr)) |
               max(yearRange) < min(floor(thisTimeArr))) {
                if(verbose) cat("- skipping file because not in yearRange\n")
                .nc_close(nc)
                next
            }

            # Calculate what positions in time array fall within yearRange
            # find first time match
            tstart <- match(min(yearRange), floor(thisTimeArr))
            if(is.na(tstart)) tstart <- 1
            # find last time match
            tend <- match(max(yearRange)+1, floor(thisTimeArr)) - 1
            if(is.na(tend)) tend <- length(thisTimeArr)

            # Modify the 'start' and 'count' arrays for ncvar_get below
            start[ndims] <- tstart
            count[ndims] <- tend-tstart+1
            if(verbose) cat("- loading only timeslices", tstart, "-", tend, "\n")

            # Trim the already-loaded time arrays to match
            thisTimeArr <- thisTimeArr[tstart:tend]
            thisTimeRaw <- thisTimeRaw[tstart:tend]
        } # if year range

        # Update running time data
        if(!is.null(thisTimeRaw)) {
            # Any overlap between this file's time array and previous data?
            if(!is.null(timeArr) && min(thisTimeArr) < max(timeArr)) {
                stop("Time overlap between files; this should not occur")
            }
            timeRaw <- c(timeRaw, thisTimeRaw)
            timeArr <- c(timeArr, thisTimeRaw / calendarDayLength + startYr)
        }

        # Finally, load the actual data and its units
        vardata <- .ncvar_get(nc, varid=variable, start=start, count=count)
        if(verbose) cat("- data", paste(dim(vardata), collapse=" x "), "\n")
        valUnit <- .ncatt_get(nc, variable, 'units')$value # load units
        loadedFiles <- c(loadedFiles, basename(fileStr))

        # Restore any 'missing' dimensions (because not present, or length=1),
        # inserting NAs into dimNames to mark what wasn't present in file
        temp <- restoreMissingDims(dim(vardata), dimNames, lonArr, latArr, ZArr,
                                   thisTimeRaw, verbose)
        vardata <- array(vardata, dim=temp[["dims"]])
        dimNames <- temp[["dimNames"]]

        # Test that spatial dimensions are identical across files
        if(length(val) > 0 && length(dimNames) > 2) {
            assert_that(all(dim(val)[1:(length(dim(val))-1)] ==
                            dim(vardata)[1:(length(dim(vardata))-1)]))
        }

        # Bind the main variable along time dimension to previously loaded data
        # Note that the time dimension, if present, is guaranteed to be last
        # ...see ncdf4 documentation
        val <- abind(val, vardata, along=length(dim(vardata)))

        .nc_close(nc)
    } # for filenames

    # If nothing loaded...
    if(length(val) == 0) {
        warning('Nothing was found, returning NULL')
        return(NULL)
    }

    x <- cmip5data(list(files=loadedFiles, val=unname(val), valUnit=valUnit,
                        lat=latArr, lon=lonArr, Z=ZArr, time=timeArr,
                        variable=variable, model=model, domain=domain,
                        experiment=experiment, ensembles=ensemble,
                        dimNames=dimNames,
                        debug=list(startYr=startYr,
                                   lonUnit=lonUnit, latUnit=latUnit,
                                   ZUnit=ZUnit,
                                   timeUnit=timeUnit,
                                   timeFreqStr=timeFreqStr,
                                   calendarUnitsStr=calendarUnitsStr,
                                   calendarStr=calendarStr, timeRaw=timeRaw,
                                   calendarDayLength=calendarDayLength)
                        ))

    # Add the provenance information, with a line for each loaded file
    for(f in fileList) {
        x <- addProvenance(x, paste("Loaded", basename(f)))
    }
    x
} # loadEnsemble
#' Restore missing and/or degenerate dimensions in the data
#'
#' @param dims integer/numeric vector: the dim() of the data array just loaded
#' from the NetCDF file
#' @param dimNames vector of dimension names present in file
#' @param lonArr numeric vector of longitude values (or NULL if absent)
#' @param latArr numeric vector of latitude values (or NULL if absent)
#' @param ZArr numeric vector of Z values (or NULL if absent)
#' @param thisTimeRaw numeric vector of time values (or NULL if absent)
#' @param verbose logical. Print info as we go?
#' @return A list with two elements: \code{dims}, the length-4 dimension vector
#' (lon, lat, Z, time), and \code{dimNames}, the matching names with \code{NA}
#' marking dimensions not present in the file.
#' @note There are two cases to consider here. (1) If we load a dimension with only
#' one value (one month, one depth, etc) then that dimension will be dropped by
#' the .ncvar_get function (there's a 'collapse_degen' option available in ncdf4,
#' but not in ncdf). (2) A dimension is missing entirely, e.g. in a time-only file,
#' or a space-only grid area file. In either case, we want those dimensions (of
#' length 1) back in the data array.
#' @note This is an internal RCMIP5 function and not exported.
#' @keywords internal
restoreMissingDims <- function(dims, dimNames, lonArr, latArr, ZArr, thisTimeRaw, verbose) {

    # Case 1: re-insert dimensions that ncvar_get collapsed because they had
    # length 1, or that were never present. Lon is conventionally first...
    if(is.null(lonArr) || length(lonArr) == 1) {
        if(verbose) cat("- restoring dimension for lon\n")
        dims <- c(1, dims)
    }
    # ...lat second...
    if(is.null(latArr) || length(latArr) == 1) {
        if(verbose) cat("- restoring dimension for lat\n")
        dims <- c(dims[1], 1, dims[2:length(dims)])
    }
    # ...Z third (between lat and time)...
    if(length(ZArr) == 1) {
        if(verbose) cat("- restoring dimension for Z\n")
        if(length(dims) >= 3)
            dims <- c(dims[1:2], 1, dims[3:length(dims)])
        else
            dims <- c(dims, 1)
    }
    # ...and time is always last.
    if(length(thisTimeRaw) == 1) {
        if(verbose) cat("- adding extra dimension for time\n")
        dims <- c(dims, 1)
    }

    # At this point, we've restored all dimensions dropped due to length 1 issues
    # Case 2: pad entirely-missing dimensions so all data moving through
    # RCMIP5 has exactly four dimensions, with NA in dimNames marking
    # dimensions that weren't in the file
    if(length(dims) == 1) { # assume time only
        if(verbose) cat("- adding extra dimensions for lon, lat, Z\n")
        dims <- c(1, 1, 1, dims)
        dimNames <- c(NA, NA, NA, dimNames)
    } else if(length(dims) == 2) { # assume lon, lat
        if(verbose) cat("- adding extra dimensions for Z, time\n")
        dims <- c(dims, 1, 1)
        dimNames <- c(dimNames, NA, NA)
    } else if(length(dims) == 3) { # assume lon, lat, time
        if(verbose) cat("- adding extra dimension for Z\n")
        dims <- c(dims[1:2], 1, dims[3])
        dimNames <- c(dimNames[1:2], NA, dimNames[3])
    } else if(length(dims) == 4) { # assume lon, lat, Z, time
        # no change needed
    } else
        stop("Variable dimensions out of bounds!")

    list(dims=dims, dimNames=dimNames)
} # restoreMissingDims