#' Aggregating timeseries to various resolutions
#'
#' This function allows you to aggregate a data.frame with a column formated as
#' 'Date' or 'POSIXct' to different time resolutions by applying a specific function.
#' @param df data.frame, A data.frame that should be aggregated
#' @param group.cols string, column names that are used to group df. Can be used
#' to aggregate the data.frame into multiple levels for example by specifying the
#' date as group.col and hour as round argument will produce a dataframe with hourly
#' values for every date. If not supplied, no grouping is performed
#' @param value.cols string, one or multiple column names that should be used
#' to calculate the new values. If not supplied, all numeric columns will be used
#' @param round string, one of 'hour', 'date', 'month' or 'season' specifying the time resolution
#' of the output
#' @param fn string, name of the function that is used to aggregate the value.cols. If
#' you want to supply multiple functions, pass them as character vector.
#' @param drop.duplicates logical, Should duplicated entries in the raw data.frame be dropped? The
#' group.cols and the rounded Datetime column will be used to check for duplicates
#' @param group.thresh double, Size in percentage that every group has to have compared
#' to the group with the largest size. Groups smaller than this threshold will be dropped.
#' @param datecol.name string, name of the column that should be used to extract the date,
#' hour or month to aggregate the dataframe. If not supplied, the function will try
#' to automatically detect this column, by looking first for a column in POSIXct format and
#' then for a column in date format.
#' @param na.action string, one of 'keep', 'ignore' or 'fill'. For 'fill', missing values will
#' be filled up to max.gap using zoo::na.approx().
#' @param max.gap integer, number of consecutive nas to fill with zoo::na.approx().
#' @keywords bioclimatic index, linear model
#' @export
#' @examples
#' fit_linear_model_groups(df = df.interp,
#' frml = huglin ~ elevation,
#' predictor.raster = dem.st,
#' file.name = data/huglin.tif,
#' set.zero = T)
aggregate_df <- function(df, value.cols = NULL, round = 'date', fn = 'mean', drop.duplicates = T,
group.thresh = 0.8, datecol = NULL, na.action = 'keep', max.gap = 3, timestep = NULL) {
#Retrieve function that is applied to the Datetime or Date column in df
round.fun <- switch(round,
'hour' = lubridate::hour,
'date' = lubridate::date,
'month' = lubridate::month,
'season' = rebecka::classify_season,
stop('Unknown round operation.
Choose one of: hour, date or month.'))
#if not supplied get name of column in Datetime or Date format
col.classes <- sapply(df, class)
if (is.null(datecol)) {
datecol.string <- rebecka::detect_datecol(df, return = 'name')
if(length(datecol.string) == 0) stop('No column in date or POSIXct format could be detected')
print(paste0('Column: ', datecol.string, ' used for aggregation of the values.'))
} else {
datecol.string <- datecol
}
if (any(is.na(df[ ,datecol.string]))) stop(paste0('NAs in ', datecol.string, ' column'))
datecol.name <- as.name(datecol.string)
#If value.cols is not supplied all numeric columns will be used as value.cols
if (is.null(value.cols)) {
value.cols <- which(sapply(col.classes, function(x) is.element('numeric', x)))
value.cols <- colnames(df)[value.cols]
print((paste0('Column: ', paste0(value.cols, collapse = ', '), ' used as value columns.')))
}
value.cols <- rlang::syms(value.cols)
group.cols <- c('round_col')
if (round == 'hour') group.cols <- c('date', 'round_col')
group.cols <- rlang::syms(group.cols)
df <- arrange(df, !!datecol.name)
#Drop duplicated entries in df based on the datecol and grouping columns
if (drop.duplicates) {
full.rows <- nrow(df)
df <- dplyr::distinct(df, !!datecol.name, .keep_all = T)
unique.rows <- nrow(df)
perc.drop <- round(((full.rows - unique.rows) * 100) / full.rows, 3)
print(paste0(perc.drop, ' % of rows dropped due to being duplicates'))
}
#Remove groups that are too small based on group.thresh
nrows.before <- nrow(df)
df.group_size <- df %>%
dplyr::mutate(round_col = round.fun(!!datecol.name),
date = lubridate::date(!!datecol.name)) %>%
dplyr::group_by(!!!group.cols) %>%
#Calculate group_size and filter only rows above the threshold
dplyr::mutate(group_size = n()) %>%
ungroup() %>%
dplyr::filter(group_size > max(group_size) * group.thresh) %>%
dplyr::select(-date)
nrows.after <- nrow(df.group_size)
perc.drop.group <- round(((nrows.before - nrows.after) * 100) / nrows.before, 3)
print(paste0(perc.drop.group, ' % of rows dropped due to small group size'))
#Generate a df with a complete timeseries where days with missing values are present as rows with NA
unit <- switch(round, 'date' = 'year', 'hour' = 'day')
df.complete <- rebecka::complete_dateseq(df.group_size,
datecol.name = datecol.string,
timestep = timestep,
verbose = T,
unit = unit) %>%
mutate(round_col = round.fun(!!datecol.name))
#Optionally fill the NAs up to max.gap, or ignor the NAs or keep them until the final result
if (na.action == 'fill') {
df.complete <- df.complete %>%
dplyr::mutate_at(dplyr::vars(!!!value.cols), function(x) {
if (all(is.na(x))) {
return(NA_integer_)
} else {
filled <- zoo::na.approx(x, maxgap = max.gap, na.rm = F)
return(filled)
}
})
na.bool <- FALSE
} else if (na.action == 'ignore') {
na.bool <- TRUE
} else {
na.bool <- FALSE
}
#Summarise the value.cols with the specified functions
df.agg <- df.complete %>%
dplyr::mutate(date = lubridate::date(!!datecol.name)) %>%
dplyr::group_by(!!!group.cols) %>%
dplyr::summarise_at(dplyr::vars(!!!value.cols), dplyr::funs_(fn), na.rm = na.bool) %>%
dplyr::ungroup() %>%
dplyr::arrange(round_col) %>%
dplyr::rename(!!round := round_col)
return(df.agg)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.