#' Compute annual statistic of a variable
#'
#' Most CMIP5 data are monthly, and we frequently want to summarize these to annual
#' numbers. This function does that (although annual files also occur, and will be
#' handled as well). The default statistic is \link{mean}, but any summary
#' function that returns a numeric result can be used.
#'
#' @param x A \code{\link{cmip5data}} object
#' @param verbose logical. Print info as we go?
#' @param sortData logical. Sort \code{x} and \code{area} before computing?
#' @param filterNum logical. Only keep the years which share the most common count.
#' For example, only keep years with 12 months and discard those with 11 or fewer.
#' @param FUN function. Function to apply across months of year
#' @param ... Other arguments passed on to \code{FUN}
#' @return A \code{\link{cmip5data}} object, whose \code{val} field is the annual
#' mean of the variable. A \code{numMonths} field is also added
#' recording the number of months averaged for each year.
#' @details The stat function is calculated for all combinations of lon,
#' lat, and Z (if present).
#' @note If \code{x} is not in a needed order (for example, \code{FUN} uses
#' weights in a different order), be sure to specify \code{sortData=TRUE}.
#' @examples
#' d <- cmip5data(1970:1975) # sample data
#' makeAnnualStat(d)
#' summary(makeAnnualStat(d))
#' summary(makeAnnualStat(d, FUN=sd))
#' @seealso \code{\link{makeZStat}} \code{\link{makeGlobalStat}} \code{\link{makeMonthlyStat}}
#' @export
makeAnnualStat <- function(x, verbose=FALSE, sortData=FALSE, filterNum=TRUE, FUN=mean, ...) {
# Sanity checks
assert_that(class(x)=="cmip5data")
assert_that(is.flag(verbose))
assert_that(is.flag(sortData))
assert_that(is.function(FUN))
# Main computation code
timer <- system.time({ # time the main computation, below
if(is.array(x$val)) {
rawYrs <- table(floor(x$time))
yrs <- as.numeric(names(rawYrs))
mostCommonCount <- as.numeric(names(table(rawYrs))[table(rawYrs) == max(table(rawYrs))])
if(filterNum) {
if(verbose) {
print('Filtering based on number in annual aggregation: ')
print(rawYrs)
print('count required: ')
print(mostCommonCount)
}
yrs <- as.numeric(names(rawYrs)[rawYrs == mostCommonCount])
if(verbose) {
print('new years:')
print(yrs)
}
}
myDim <- dim(x$val)
x$val <- vapply(yrs,
FUN=function(yrNum, ...) {
temp <- x$val[,,,yrNum == floor(x$time)]
myDim[4] <- sum(yrNum == floor(x$time))
dim(temp) <- myDim
temp <- apply(temp, c(1,2,3), FUN, ...)
myDim[4] <- 1
dim(temp) <- myDim
return(temp)},
FUN.VALUE=x$val[,,,1], ...)
myDim[4] <- length(yrs)
dim(x$val) <- myDim
x$time <- yrs
x$numPerYear <- as.integer(rawYrs[as.numeric(names(rawYrs)) %in% yrs])
x$debug$AnnualFreqTable <- rawYrs
} else {
# Suppress stupid NOTEs from R CMD CHECK
lon <- lat <- Z <- time <- year <- value <- `.` <- NULL
# Put data in consistent order BEFORE overwriting time
if(sortData) {
if(verbose) cat("Sorting data...\n")
x$val <- group_by(x$val, lon, lat, Z, time) %>%
arrange()
}
# Instead of "summarise(value=FUN(value, ...))", we use the do()
# call below, because the former doesn't work (as of dplyr 0.3.0.9000):
# the ellipses cause big problems. This solution thanks to Dennis
# Murphy on the manipulatr listesrv.
x$val <- x$val %>%
# start by taking spatial mean, in case there are multiple data per spatial point
# looking at you, IPSL-CM5A-MR
group_by(lon, lat, Z, time) %>%
summarise(value=mean(value, na.rm=TRUE)) %>%
# now move on to year summary
mutate(year = floor(time)) %>%
group_by(lon, lat, Z, year) %>%
do(data.frame(value = FUN(.$value, ...),
counts = length(.$value))) %>%
ungroup()
freqTable <- unique(x$val[,c('year', 'counts')])
assert_that(max(freqTable$counts) <= 12) # shouldn't be more than 12 months per year
if(filterNum) {
if(verbose) {
print('Filtering based on number in annual aggregation: ')
print(freqTable)
print('number required: ')
print(freqTable$year[which.max(freqTable$counts)] )
}
x$val <- x$val[x$val$counts == max(freqTable$counts),]
x$numPerYear <- freqTable$counts[freqTable$counts == max(freqTable$counts)]
} else {
x$numPerYear <- freqTable$counts[order(freqTable$year)]
}
x$val <- x$val[c('lon', 'lat', 'Z', 'year', 'value')]
x$val$time <- x$val$year
x$val$year <- NULL
if(verbose) {
print(head(x$val))
}
# dplyr doesn't (yet) have a 'drop=FALSE' option, and the summarise
# command above may have removed some lon/lat combinations
if(length(unique(x$val$lon)) < length(unique(as.numeric(x$lon))) |
length(unique(x$val$lat)) < length(unique(as.numeric(x$lat)))) {
if(verbose) cat("Replacing missing lon/lat combinations\n")
# Fix this by generating all lon/lat pairs and combining with answer
full_data <- tbl_df(data.frame(lon=as.vector(x$lon), lat=as.vector(x$lat)))
x$val <- left_join(full_data, x$val, by=c("lon", "lat"))
}
x$time <- sort(freqTable$year)
x$debug$AnnualFreqTable <- freqTable
}
}) # system.time
if(verbose) cat('\nTook', timer[3], 's\n')
x$debug$timeFreqStr <- "years (summarized)"
addProvenance(x, paste("Calculated",
paste(deparse(substitute(FUN)), collapse="; "),
"for years", min(x$time), "-", max(x$time)))
} # makeAnnualStat
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.