#' Tools for Manipulating and Summarizing CMIP5 Data
#'
#' Working with CMIP5 data can be tricky, forcing scientists to
#' write custom scripts and programs. The `RCMIP5` package aims
#' to ease this process, providing a standard, robust, and
#' high-performance set of functions to (i) explore what data
#' have been downloaded, (ii) identify missing data, (iii)
#' average (or apply other mathematical operations) across
#' experimental ensembles, (iv) produce both temporal and spatial
#' statistical summaries, and (v) produce
#' easy-to-work-with graphical and data summaries.
#'
#' ...
#'
#' @references Todd-Brown and Bond-Lamberty, 2014: (in prep).
#' @references Taylor et al., 2012:
#' An overview of CMIP5 and the experiment design, Bulletin of the American
#' Meteorological Society, 93, 485-498.
#' \url{http://dx.doi.org/10.1175/BAMS-D-11-00094.1}
#' @import dplyr digest abind assertthat Matrix
#' @docType package
#' @name RCMIP5
NULL
#' The 'cmip5data' class
#'
#' This constructor has two functions. First, given a list, it makes the list
#' a cmip5data-class object (no check is made that the list has appropriate
#' fields though). Second, if given a numeric value(s), it returns sample/
#' example data in the newly constructed object. This is used extensively by
#' the testing code.
#'
#' @param x A list or numeric. If x is a list then the fields are expected to match those of the returned cmip5data object. If x is a numeric sample data is created where the numeric indicates the years of sample data to return.
#' @param lonlat Boolean indicating whether to create lon and lat dimensions
#' @param lonsize Integer size of longitude dimension
#' @param latsize Integer size of latitude dimension
#' @param Z logical. Create Z dimension?
#' @param Zsize integer. Size of Z dimension
#' @param time logical. Create time dimension?
#' @param monthly logical. Monthly (if not, annual) data?
#' @param randomize logical. Random sample data?
#' @param irregular logical. Irregular lon/lat grid?
#' @param verbose logical. Print info as we go?
#' @param loadAs a string identifying possible structures for values. Currently: 'data.frame' and 'array' the only valid options.
#' @return A cmip5data object, which is a list with the following fields:
#' \item{files}{Array of strings containg the file(s) included in this dataset}
#' \item{variable}{String containg the variable name described by this dataset}
#' \item{model}{String containing the model name of this dataset}
#' \item{experiment}{String containing the experiment name of this dataset}
#' \item{ensembles}{Array of strings containg the ensemble(s) included in this dataset}
#' \item{domain}{String containing the domain name of this dataset}
#' \item{val}{Data frame holding data, with fields lon, lat, Z, time}
#' \item{valUnit}{String containing the value units}
#' \item{lon}{Numeric vector containing longitude values; may be \code{NULL}}
#' \item{lat}{Numeric vector containing latitude values; may be \code{NULL}}
#' \item{Z}{Numeric vector Z values; may be \code{NULL}}
#' \item{time}{Numeric vector containing time values; may be \code{NULL}}
#' \item{dimNames}{Array of strings containing the original (NetCDF) dimension names}
#' \item{calendarStr}{String defining the calendar type; may be \code{NULL}}
#' \item{debug}{List with additional data (subject to change)}
#' \item{provenance}{Data frame with the object's provenance. See \code{\link{addProvenance}}}
#' \item{numPerYear}{Numeric vector; only present after \code{\link{makeAnnualStat}}}
#' \item{numYears}{Numeric vector; only present after \code{\link{makeMonthlyStat}}}
#' \item{numCells}{Numeric vector; only present after \code{\link{makeGlobalStat}}}
#' \item{filtered}{Logical; only present after \code{\link{filterDimensions}}}
#' @docType class
#' @examples
#' cmip5data(1970) # produces monthly sample data for year 1970
#' cmip5data(1970:2014)
#' cmip5data(1970:2014, monthly=FALSE) # annual data
#' cmip5data(1970:2014, randomize=TRUE) # randomized data
#' cmip5data(1970:2014, Z=TRUE) # four-dimensional data
#' cmip5data(0, time=FALSE) # sample 'fx' data, two-dimensional
#' cmip5data(list()) # makes this (here empty) list class into 'cmip5data'
#' @export
cmip5data <- function(x=list(),
# parameters for making sample data
lonlat=TRUE, lonsize=10, latsize=10,
Z=FALSE, Zsize=5,
time=TRUE, monthly=TRUE,
randomize=FALSE, irregular=FALSE,
verbose=FALSE, loadAs='data.frame') {
# Sanity checks
assert_that(is.flag(lonlat))
assert_that(is.numeric(lonsize))
assert_that(is.numeric(latsize))
assert_that(is.flag(Z))
assert_that(is.numeric(Zsize))
assert_that(is.flag(time))
assert_that(is.flag(monthly))
assert_that(is.flag(randomize))
assert_that(is.flag(irregular))
assert_that(is.flag(verbose))
assert_that(loadAs %in% c("data.frame", "array"))
if (is.list(x)) { # If x is a list then cast it directly to a cmip5data object
if(verbose) cat("Casting list to cmip5data\n")
structure(x, class="cmip5data")
} else if(is.numeric(x)) { # Create sample data
if(verbose) cat("Creating new cmip5data\n")
# Construct two lists which will be used to create the sample data:
# ... result and debug.
# result holds the primary data of interest
result <- list(
files=NULL,
variable="var",
model="model",
experiment="experiment",
ensembles="ensemble",
domain="domain",
val=NULL,
valUnit=NULL,
lon=NULL,
lat=NULL,
Z=NULL,
time=NULL,
dimNames=NULL
)
debug <- list()
# If this data will have spatial dimensions, construct
if(lonlat) {
if(verbose) cat("Adding spatial dimensions\n")
# realistic lon (0 to 360) and lat (-90 to 90) numbers
result$lon <- 360/lonsize * c(0:(lonsize-1)) + 360/lonsize/2
result$lat <- 180/latsize * c(0:(latsize-1)) - 90 + 180/latsize/2
# Convert to two dimensions
result$lon <- array(result$lon, dim=c(lonsize, latsize))
result$lat <- array(rep(result$lat, 1, each=lonsize), dim=c(lonsize, latsize))
if(irregular) {
result$lon <- result$lon + round(runif(lonsize * latsize), 1)
result$lat <- result$lat + round(runif(lonsize * latsize), 1)
}
result$dimNames=c("lon", "lat")
debug$lonUnit <- "degrees_east"
debug$latUnit <- "degrees_north"
} else {
result$dimNames <- c(NA, NA)
}
# If this data will have Z dimension, construct
if(Z) {
if(verbose) cat("Adding Z dimensions\n")
result$Z <- c(0:(Zsize-1))
result$dimNames <- c(result$dimNames, "Z")
debug$ZUnit <- "m"
} else {
result$dimNames <- c(result$dimNames, NA)
}
# If this data will have time dimension, construct
if(time) {
if(verbose) cat("Adding time dimensions\n")
years <- x
ppy <- ifelse(monthly, 12, 1) # periods per year
result$calendarStr <- "360_day"
debug$timeFreqStr <- ifelse(monthly, "mon", "yr")
debug$startYr <- years[1]
debug$calendarStr <- "360_day"
debug$timeUnit <- paste0("days since ",years[1],"-01-01")
if(monthly) {
# '+15' initalizes all time stamps to be middle of the month
debug$timeRaw <- (360/ppy*c(0:(length(years)*ppy-1) )+15)
result$time <- debug$timeRaw/360+min(years)
} else {
debug$timeRaw <- result$time <- years
}
# convert day based calandar to year based
result$dimNames <- c(result$dimNames, "time")
} else { # no time
result$dimNames <- c(result$dimNames, NA)
result$domain <- "fx"
}
# Create the data array
finalDim <- c(max(1, length(result$lon[,1])),
max(1, length(result$lat[1,])),
max(1, length(result$Z)),
max(1, length(result$time)))
if(randomize) {
result$val <- runif(n=prod(finalDim))
} else {
result$val <- rep(1, prod(finalDim))
}
dim(result$val) <- finalDim
# Miscellany
result$valUnit <- "unit"
result$debug <- debug
# Add debug info and set class
result <- structure(result, class="cmip5data")
# Convert to data frame representation, if requested
if(loadAs == 'data.frame') {
result$val <- convert_array_to_df(result, verbose)
}
# Initialize provenance and return
addProvenance(result, "Dummy data created")
} else {
stop("Don't know what to do with this class of parameter")
}
}
#' Print a 'cmip5data' class object.
#'
#' @param x A \code{\link{cmip5data}} object
#' @param ... Other parameters passed to cat
#' @details Prints a one-line summary of the object
#' @method print cmip5data
#' @export
#' @keywords internal
print.cmip5data <- function(x, ...) {
if(all(unlist(lapply(x, is.null)))) {
cat("(Empty cmip5data object)")
return()
}
ansStr <- paste0('CMIP5: ', x$variable, ", ", x$model, " ", x$experiment)
spaceStr <- paste(dim(x$lon)[1], "x", dim(x$lat)[2], "x", length(x$Z), "x", length(x$time))
ansStr <- paste0(ansStr, ", ", spaceStr)
timeStr <- "no time"
if(!is.null(x$time) & length(x$time) > 0) {
timeStr <- paste(floor(min(x$time, na.rm=TRUE)), "to",
floor(max(x$time, na.rm=TRUE)))
}
ansStr <- paste0(ansStr, ", ", timeStr)
if(!is.null(x$ensembles)) {
ansStr <- paste0(ansStr, ", from ", length(x$ensembles), " ",
ifelse(length(x$ensembles)==1, "ensemble", "ensembles"))
}
cat(ansStr, "\n")
cat(...)
} # print.cmip5data
#' Summarize a 'cmip5data' class object.
#'
#' @param object A \code{\link{cmip5data}} object
#' @param ... ignored
#' @details Prints a short summary of the object.
#' @return A summary structure of the object.
#' @method summary cmip5data
#' @export
#' @keywords internal
summary.cmip5data <- function(object, ...) {
ans <- list()
class(ans) <- "summary.cmip5data"
# cmip5 objects should always have the following defined:
ans$variable <- object$variable
ans$valUnit <- object$valUnit
ans$domain <- object$domain
ans$model <- object$model
ans$experiment <- object$experiment
ans$ensembles <- object$ensembles
ans$type <- "CMIP5 data"
# if(grepl('makeAnnualStat', rev(object$provenance$caller)[1])) {
# ans$type <- paste(ans$type, "annual summary [",
# paste(unique(object$numPerYear), collapse=', '),
# "]")
# }
if(!is.null(object$numPerYear)) {
ans$type <- paste0(ans$type, " (annual summary of ", mean(object$numPerYear), " times)")
}
if(!is.null(object$numYears)) {
ans$type <- paste0(ans$type, " (monthly summary of ", mean(object$numYears), " years)")
}
if(!is.null(object$numCells)) {
ans$type <- paste0(ans$type, " (spatial summary of ", object$numCells, " cells)")
}
if(!is.null(object$numZs)) {
ans$type <- paste0(ans$type, " (Z summary of ", object$numZs, " levels)")
}
if(!is.null(object$filtered)) {
ans$type <- paste(ans$type, "(filtered)")
}
# if(!is.null(object$area)) {
# ans$type <- paste(ans$type, "(regridded)")
# }
ans$spatial <- paste0("lon [", dim(object$lon)[1],
"] lat [", dim(object$lat)[2],
"] Z [", length(object$Z), "]")
ans$time <- paste0(object$debug$timeFreqStr, " [", length(object$time), "] ", object$debug$timeUnit)
ans$size <- as.numeric(object.size(object))
if(is.array(object$val)) {
ans$valsummary <- c(min(object$val, na.rm=TRUE),
mean(object$val, na.rm=TRUE),
max(object$val, na.rm=TRUE))
} else {
ans$valsummary <- c(min(object$val$value, na.rm=TRUE),
mean(object$val$value, na.rm=TRUE),
max(object$val$value, na.rm=TRUE))
#} else {
# stop('Class of value is not recognized.')
}
ans$provenance <- object$provenance
return(ans)
} # summary.cmip5data
#' Print the summary for a 'cmip5data' class object.
#'
#' @param x A \code{\link{cmip5data}} object
#' @param ... Other parameters passed to cat
#' @details Prints a one-line summary of the object
#' @method print summary.cmip5data
#' @export
#' @keywords internal
print.summary.cmip5data <- function(x, ...) {
cat(x$type, "\n")
cat("Variable: ", x$variable, " (", x$valUnit, ") from model ", x$model, "\n", sep="")
cat(sprintf("Data range: %.2g to %.2g Mean: %.2g\n", x$valsummary[1], x$valsummary[3], x$valsummary[2]))
cat("Experiment:", x$experiment, "-", length(x$ensembles), "ensemble(s)\n")
cat("Spatial dimensions:", x$spatial, "\n")
cat("Time dimension:", x$time, "\n")
cat("Size:", format(round(x$size/1024/1024, 1), nsmall=1), "MB\n")
cat("Provenance has", nrow(x$provenance), "entries\n")
} # print.summary.cmip5data
#' Convert a cmip5data object to a data frame
#'
#' @param x A \code{\link{cmip5data}} object
#' @param ... Other parameters
#' @param originalNames logical. Use original dimension names from file?
#' @return The object converted to a data frame
#' @method as.data.frame cmip5data
#' @export
#' @keywords internal
as.data.frame.cmip5data <- function(x, ..., originalNames=FALSE) {
# Sanity checks
assert_that(is.flag(originalNames))
if(is.array(x$val)) {
convert_array_to_df(x)
} else {
# Suppress stupid NOTEs from R CMD CHECK
lon <- lat <- Z <- time <- NULL
dplyr::arrange(x$val, lon, lat, Z, time)
}
} # as.data.frame.cmip5data
#' Convert a cmip5data object to an array
#'
#' @param x A \code{\link{cmip5data}} object
#' @param ... Other parameters
#' @param drop logical. Drop degenerate dimensions?
#' @return The object converted to an array
#' @method as.array cmip5data
#' @export
#' @keywords internal
as.array.cmip5data <- function(x, ..., drop=TRUE) {
# Sanity checks
assert_that(is.flag(drop))
if(is.array(x$val)) {
adrop(x$val, drop=drop)
} else {
dimList <- c(length(unique(x$val$lon)),
length(unique(x$val$lat)),
length(unique(x$val$Z)),
length(unique(x$val$time)))
# Remove degenerate dimensions
if(drop) {
dimList <- dimList[!dimList %in% 1]
}
# Suppress stupid NOTEs from R CMD CHECK
lon <- lat <- Z <- time <- NULL
# Note we sort data frame before converting to array!
array(dplyr::arrange(x$val, time, Z, lat, lon)$value, dim=dimList)
}
} # as.array.cmip5data
#' Make package datasets and write them to disk.
#'
#' @param path root of directory tree
#' @param maxSize max size (in MB) of dataset to write
#' @param outpath directory to write to
#' @details Writes all available ensembles to disk as Rdata files, subject to
#' a maximum size parameter (CRAN says keep sample data < 5MB).
#' @note This is an internal RCMIP5 function and not exported.
#' @keywords internal
makePackageData <- function(path="./sampledata", maxSize=Inf, outpath="./data") {
# Sanity checks
assert_that(is.dir(path))
assert_that(is.readable(path))
assert_that(is.numeric(maxSize))
if(!file.exists(outpath)) dir.create(outpath)
assert_that(is.dir(outpath))
assert_that(is.writeable(outpath))
datasets <- getFileInfo(path)
if(is.null(datasets)) return()
for(i in 1:nrow(datasets)) {
cat("-----------------------\n", datasets[i, "filename"], "\n")
d <- with(datasets[i,],
loadEnsemble(variable, model, experiment, ensemble, path=path, verbose=T)
)
print(object.size(d), units="MB")
if(object.size(d)/1024/1024 <= maxSize) {
objname <- gsub("_[0-9]{4,}-[0-9]{4,}.nc$", "", basename(d$files[1])) # strip dates
assign(objname, d)
cat("Writing", objname, "\n")
save(list=objname, file=paste0(outpath, "/", objname, ".rda"))
} else {
cat("Too big; skipping\n")
}
}
} # makePackageData
#' Alternative weighted mean
#'
#' @param x vector of data
#' @param w vector of weights
#' @param na.rm Remove NAs in both data and weights?
#' @return Weighted mean of \code{x}, using weights \code{d}.
#' @details The \code{stats} version of weighted.mean doesn't handle weights in a very
#' useful way. Specifically, it will remove missing \code{x} values, but not missing weights.
#' A fair number of CMIP5 models have \code{NA} values in their grid areas, so this will
#' quickly cause problems. This function removes (if \code{na.rm=TRUE}) missing observations
#' and weights before computing the weighted mean.
#' @export
#' @seealso \code{\link{weighted.mean}}
cmip5.weighted.mean <- function(x, w=rep(1, length(x)), na.rm=TRUE) {
assert_that(length(x) == length(w))
na <- FALSE
if(na.rm) na <- is.na(x) | is.na(w)
sum((x*w)[!na]) / sum(w[!na])
} # cmip5.weighted.mean
#' Return data values
#'
#' @param x A \code{\link{cmip5data}} object
#' @return Data values in \code{x}.
#' @details Abstracts away getting values, the method for which
#' varies depending on backend implementation.
#' @keywords internal
#' @note This is an internal RCMIP5 function and not exported.
vals <- function(x) {
assert_that(class(x)=="cmip5data")
if(is.data.frame(x$val)) {
x$val$value
} else if(is.array(x$val)) {
as.numeric(x$val)
} else
stop("Unknown data implementation")
} # vals
#' Return number of data values
#'
#' @param x A \code{\link{cmip5data}} object
#' @return Number of data values in \code{x}.
#' @details Abstracts away getting number of values, the method for which
#' varies depending on backend implementation.
#' @keywords internal
#' @note This is an internal RCMIP5 function and not exported.
nvals <- function(x) {
length(vals(x))
} # nvals
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.