rebecka_package: Utility functions used in the REBECKA project

Documented in aggregate_df

#' Aggregating timeseries to various resolutions
#'
#' This function allows you to aggregate a data.frame with a column formated as
#' 'Date' or 'POSIXct' to different time resolutions by applying a specific function.
#' @param df data.frame, A data.frame that should be aggregated
#' @param group.cols string, column names that are used to group df. Can be used
#' to aggregate the data.frame into multiple levels for example by specifying the
#' date as group.col and hour as round argument will produce a dataframe with hourly
#' values for every date. If not supplied, no grouping is performed
#' @param value.cols string, one or multiple column names that should be used
#' to calculate the new values. If not supplied, all numeric columns will be used
#' @param round string, one of 'hour', 'date', 'month' or 'season' specifying the time resolution
#' of the output
#' @param fn string, name of the function that is used to aggregate the value.cols. If
#' you want to supply multiple functions, pass them as character vector.
#' @param drop.duplicates logical, Should duplicated entries in the raw data.frame be dropped? The
#' group.cols and the rounded Datetime column will be used to check for duplicates
#' @param group.thresh double, Size in percentage that every group has to have compared
#' to the group with the largest size. Groups smaller than this threshold will be dropped.
#' @param datecol.name string, name of the column that should be used to extract the date,
#' hour or month to aggregate the dataframe. If not supplied, the function will try
#' to automatically detect this column, by looking first for a column in POSIXct format and
#' then for a column in date format.
#' @param na.action string, one of 'keep', 'ignore' or 'fill'. For 'fill', missing values will
#' be filled up to max.gap using zoo::na.approx().
#' @param max.gap integer, number of consecutive nas to fill with zoo::na.approx().
#' @keywords bioclimatic index, linear model
#' @export
#' @examples
#' fit_linear_model_groups(df = df.interp,
#'   frml = huglin ~ elevation,
#'   predictor.raster = dem.st,
#'   file.name = data/huglin.tif,
#'   set.zero = T)

aggregate_df <- function(df, value.cols = NULL, round = 'date', fn = 'mean', drop.duplicates = T,
                         group.thresh = 0.8, datecol = NULL, na.action = 'keep', max.gap = 3, timestep = NULL) {

  #Retrieve function that is applied to the Datetime or Date column in df
  round.fun <- switch(round,
                      'hour' = lubridate::hour,
                      'date' = lubridate::date,
                      'month' = lubridate::month,
                      'season' = rebecka::classify_season,
                      stop('Unknown round operation.
                           Choose one of: hour, date or month.'))

  #if not supplied get name of column in Datetime or Date format
  col.classes <- sapply(df, class)
  if (is.null(datecol)) {

    datecol.string <- rebecka::detect_datecol(df, return = 'name')

    if(length(datecol.string) == 0) stop('No column in date or POSIXct format could be detected')

    print(paste0('Column: ', datecol.string, ' used for aggregation of the values.'))

  } else {

    datecol.string <- datecol

  }

  if (any(is.na(df[ ,datecol.string]))) stop(paste0('NAs in ', datecol.string, ' column'))

  datecol.name <- as.name(datecol.string)

  #If value.cols is not supplied all numeric columns will be used as value.cols
  if (is.null(value.cols)) {

    value.cols <- which(sapply(col.classes, function(x) is.element('numeric', x)))
    value.cols <- colnames(df)[value.cols]

    print((paste0('Column: ', paste0(value.cols, collapse = ', '), ' used as value columns.')))

  }

  value.cols <- rlang::syms(value.cols)

  group.cols <- c('round_col')
  if (round == 'hour') group.cols <- c('date', 'round_col')
  group.cols <- rlang::syms(group.cols)

  df <- arrange(df, !!datecol.name)

  #Drop duplicated entries in df based on the datecol and grouping columns
  if (drop.duplicates) {

    full.rows <- nrow(df)

    df <- dplyr::distinct(df, !!datecol.name, .keep_all = T)

    unique.rows <- nrow(df)

    perc.drop <- round(((full.rows - unique.rows) * 100) / full.rows, 3)
    print(paste0(perc.drop, ' % of rows dropped due to being duplicates'))

  }

  #Remove groups that are too small based on group.thresh
  nrows.before <- nrow(df)
  df.group_size <- df %>%

    dplyr::mutate(round_col = round.fun(!!datecol.name),
                  date = lubridate::date(!!datecol.name)) %>%

    dplyr::group_by(!!!group.cols) %>%

    #Calculate group_size and filter only rows above the threshold
    dplyr::mutate(group_size = n()) %>%
    ungroup() %>%
    dplyr::filter(group_size > max(group_size) * group.thresh) %>%

    dplyr::select(-date)

  nrows.after <- nrow(df.group_size)
  perc.drop.group <- round(((nrows.before - nrows.after) * 100) / nrows.before, 3)
  print(paste0(perc.drop.group, ' % of rows dropped due to small group size'))

  #Generate a df with a complete timeseries where days with missing values are present as rows with NA
  unit <- switch(round, 'date' = 'year', 'hour' = 'day')
  df.complete <- rebecka::complete_dateseq(df.group_size,
                                           datecol.name = datecol.string,
                                           timestep = timestep,
                                           verbose = T,
                                           unit = unit) %>%

    mutate(round_col = round.fun(!!datecol.name))

  #Optionally fill the NAs up to max.gap, or ignor the NAs or keep them until the final result
  if (na.action == 'fill') {

    df.complete <- df.complete %>%

      dplyr::mutate_at(dplyr::vars(!!!value.cols), function(x) {
        if (all(is.na(x))) {
          return(NA_integer_)
        } else {
          filled <- zoo::na.approx(x, maxgap = max.gap, na.rm = F)
          return(filled)
        }
      })

    na.bool <- FALSE

  } else if (na.action == 'ignore') {
    na.bool <- TRUE
  } else {
    na.bool <- FALSE
  }

  #Summarise the value.cols with the specified functions
  df.agg <- df.complete %>%

    dplyr::mutate(date = lubridate::date(!!datecol.name)) %>%

    dplyr::group_by(!!!group.cols) %>%

    dplyr::summarise_at(dplyr::vars(!!!value.cols), dplyr::funs_(fn), na.rm = na.bool) %>%

    dplyr::ungroup() %>%

    dplyr::arrange(round_col) %>%
    dplyr::rename(!!round := round_col)

  return(df.agg)

}