#' Aggregates a data frame to a shorter time period
#'
#' @description CRHM data (observations and outputs) are generally produced at hourly time steps. This function aggregates CRHM data to daily, monthly or yearly values. The data can be aggregated by their maxima, minima, means, sums and any combination of these statistics.
#' @param CRHMdataframe Required. A valid \pkg{CRHMr} data frame.
#' @param columns The columns to be aggregated, not including the datetime. The default is the first column.
#' This can be a vector, i.e. c(1,2,3).
#' @param period The period of aggregation. Must be one of \option{hourly}, \option{daily}, \option{monthly}, \option{yearly}
#' (or \option{annual}) or \option{hydro}. Default is \option{yearly}. Multiple-hour aggregation is not yet supported.
#' @param funs A character vector containing the function(s) for aggregation. The default is \option{mean},
#' but can also include \option{min}, \option{max}, \option{sum}, \option{length}, \code{delta}, and \code{first}. The function(s) will be applied to all of the specified columns
#' @param AggFilename Optional. File name for the aggregated data.
#' @param startMonth Optional. Starting month, to be used when aggregating by hydrological year.
#' @param useSecondYear Optional. Logical. Should the hydrological year be based on the first or second calendar year.
#' In other words would January 1, 2015 be the hydrological year 2014 or 2015? The default is \code{TRUE}
#' (i.e., the hydrological year would be 2015). Note that the Campbell Scientific program SPLIT uses the
#' first calendar year (i.e., the hydrological year would be 2014). To emulate this program, set \code{useSecondYear}
#' to be \code{FALSE}.
#' @param omitMissing Optional. If \code{FALSE} missing (i.e. \code{NA_real_}) values will
#' be included in the aggregation, causing the aggregated values to also be \code{NA_real_}, If
#' \code{TRUE}, then the missing values will be omitted from the calculations. This option
#' is only used by the functions \code{max}, \code{min}, \code{mean} and \code{sum}.
#' @param logfile Optional. Name of the file to be used for logging the action. Normally not used.
#' @return Returns a data frame with the aggregated values.
#' @note The period of aggregation must be greater than the time step of the CRHM data.
#' This function does NOT remove \code{NA} values before differencing.
#' @seealso \code{\link{interpolate}} \code{\link{impute}} \code{\link{yearlyPeaks}} \code{\link{hydroYear}}
#' @author Kevin Shook
#' @examples
#' Badlake.t.monthly <- aggDataframe(BadLake7376,
#' period = "monthly",
#' columns = 1, funs = c("mean")
#' )
#' Badlake.radiation.daily <- aggDataframe(BadLake7376,
#' period = "daily",
#' columns = c(6, 7, 8), funs = c("mean")
#' )
#' @importFrom lubridate ceiling_date floor_date tz
#' @importFrom stringr str_detect fixed str_replace str_c
#' @export
aggDataframe <- function(CRHMdataframe, columns = 1, period = "annual", funs = c("mean"),
AggFilename = "", startMonth = 10, useSecondYear = TRUE,
omitMissing = FALSE, logfile = "") {
# aggregates a dataframe of CRHM variables (obs or outputs) to longer time periods
if (period == "" | nrow(CRHMdataframe) == 0 | (length(columns) == 0)) {
cat("Error: missing variables\n")
return(FALSE)
}
CRHMname <- deparse(substitute(CRHMdataframe))
if (length(columns) > 1) {
selected <- CRHMdataframe[, (columns + 1)]
selected.names <- names(selected)
}
else {
selected <- data.frame(CRHMdataframe[, (columns + 1)])
selected.names <- names(CRHMdataframe)[columns + 1]
names(selected) <- selected.names
}
if (is.numeric(period)) {
# do sub-daily aggregation
period.hours <- period
if (period == 1) {
time.period <- "hour"
times <- ceiling_date(CRHMdataframe$datetime, unit = time.period)
agg <- data.frame(unique(times))
names(agg) <- time.period
}
else {
cat("Error: multiple-hour aggregation not yet supported\n")
return(FALSE)
}
}
else {
period <- tolower(period)
if (str_detect(period, "ann") | str_detect(period, "year")) {
time.period <- "year"
period.hours <- 365 * 24
}
else if (str_detect(period, "da")) {
time.period <- "day"
period.hours <- 24
}
else if (str_detect(period, "mo") & !str_detect(period, "year")) {
time.period <- "month"
period.hours <- 30 * 24
}
else if (str_detect(period, "ho")) {
time.period <- "hour"
period.hours <- 1
}
else if (str_detect(period, "hy")) {
time.period <- "hydrologic-year"
period.hours <- 365 * 24
}
else {
time.period <- "year"
period.hours <- 365 * 24
}
# set up time step for multi-day aggregation
if (time.period == "hydrologic-year") {
hYear <- hydroYear(CRHMdataframe, startMonth, useSecondYear)
times <- hYear
agg <- data.frame(unique(hYear))
if (useSecondYear) {
names(agg) <- "hydrological_year_second"
} else {
names(agg) <- "hydrological_year_first"
}
}
else if (time.period == "hour") {
times <- ceiling_date(CRHMdataframe$datetime, unit = time.period)
agg <- data.frame(unique(times))
names(agg) <- time.period
}
else {
times <- floor_date(CRHMdataframe$datetime, unit = time.period)
agg <- data.frame(unique(times))
names(agg) <- time.period
}
}
# check period vs existing period
current.period <- timestep.hours(CRHMdataframe$datetime[1], CRHMdataframe$datetime[2])
if (current.period >= period.hours) {
cat("Error: cannot aggregate to a shorter time period\n")
return(FALSE)
}
# do aggregation
if (sum(str_detect(funs, "max"))) {
max.vals <- aggregate(selected,
by = list(times), FUN = "max",
na.rm = omitMissing
)
max.names <- names(max.vals)[-1]
max.vals <- data.frame(max.vals[, -1])
max.names <- paste(max.names, ".max", sep = "")
names(max.vals) <- max.names
agg <- cbind(agg, max.vals)
}
if (sum(str_detect(funs, "min"))) {
min.vals <- aggregate(selected,
by = list(times), FUN = "min",
na.rm = omitMissing
)
min.names <- names(min.vals)[-1]
min.vals <- data.frame(min.vals[, -1])
min.names <- paste(min.names, ".min", sep = "")
names(min.vals) <- min.names
agg <- cbind(agg, min.vals)
}
if (sum(str_detect(funs, "mean"))) {
mean.vals <- aggregate(selected,
by = list(times), FUN = "mean",
na.rm = omitMissing
)
mean.names <- names(mean.vals)[-1]
mean.vals <- data.frame(mean.vals[, -1])
mean.names <- paste(mean.names, ".mean", sep = "")
names(mean.vals) <- mean.names
agg <- cbind(agg, mean.vals)
}
if (sum(str_detect(funs, "sum"))) {
sum.vals <- aggregate(selected,
by = list(times), FUN = "sum",
na.rm = omitMissing
)
sum.names <- names(sum.vals)[-1]
sum.vals <- data.frame(sum.vals[, -1])
sum.names <- paste(sum.names, ".sum", sep = "")
names(sum.vals) <- sum.names
agg <- cbind(agg, sum.vals)
}
if (sum(str_detect(funs, "length"))) {
length.vals <- aggregate(selected, by = list(times), FUN = "length")
length.names <- names(sum.vals)[-1]
length.vals <- data.frame(sum.vals[, -1])
length.names <- paste(length.names, ".length", sep = "")
names(length.vals) <- length.names
agg <- cbind(agg, length.vals)
}
if (sum(str_detect(funs, "first"))) {
first_days <- floor_date(CRHMdataframe$datetime, "month")
first.vals <- selected[CRHMdataframe$datetime == first_days, ]
first_dates <- CRHMdataframe$datetime[CRHMdataframe$datetime == first_days]
first.names <- names(selected)
first.vals <- data.frame(first_dates, first.vals)
first.names <- paste(first.names, ".first", sep = "")
names(first.vals) <- c(names(agg)[1], first.names)
agg <- merge(agg, first.vals, all.x = TRUE)
}
if (sum(str_detect(funs, "delta"))) {
vals <- data.frame(CRHMdataframe$datetime, selected)
names(vals)[1] <- "datetime"
# get closest datetime to specified dates/times
if (time.period == "year" |
time.period == "month" |
time.period == "day" |
time.period == "hydrologic-year") {
names(agg)[1] <- "date"
if (time.period == "year") {
agg$date <- as.Date(paste(agg$date, "-01-01", sep = ""),
format = "%Y-%m-%d"
)
} else if (time.period == "hydrologic-year") {
agg$date <- as.Date(paste(agg$date, "-", startMonth, "-01", sep = ""),
format = "%Y-%m-%d"
)
}
timezone <- tz(CRHMdataframe$datetime)
agg_datetime <- dateToDatetime(agg, hour = 23, timezone = timezone)
}
# now merge
merged <- merge(agg_datetime, vals, by = "datetime")
colnums <- ncol(merged) - 1
for (i in 1:colnums) {
delta <- diff(merged[, (i + 1)])
merged[, (i + 1)] <- c(NA_real_, delta)
}
delta.names <- names(vals)[-1]
delta.names <- paste(delta.names, ".delta", sep = "")
names(merged)[-1] <- delta.names
agg <- merged
agg[, 1] <- as.numeric(format(agg[, 1], format = "%Y"))
names(agg)[1] <- "year"
}
# remove double periods from variable names
agg.names <- names(agg)
agg.names <- str_replace(agg.names, fixed(".."), ".")
names(agg) <- agg.names
# format dates
if (time.period == "year") {
agg[, 1] <- as.numeric(format(agg[, 1], format = "%Y"))
} else if (time.period == "month") {
agg[, 1] <- format(agg[, 1], format = "%Y-%m")
} else if (time.period == "day") {
agg[, 1] <- as.Date(agg[, 1])
names(agg)[1] <- "date"
}
else if (time.period == "hour") {
names(agg)[1] <- "datetime"
}
# write to file, if name provided
if (AggFilename != "") {
write.csv(agg, file = AggFilename, row.names = FALSE, col.names = TRUE)
} else {
AggFilename <- "NA"
}
comment <- paste("aggDataframe dataframe:", CRHMname, " Aggfuns:",
str_c(funs, collapse = ","),
" period:", period, " file:", AggFilename,
sep = ""
)
result <- logAction(comment, logfile)
if (result) {
return(agg)
} else {
return(result)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.