R/group_apply.R
In palaeoverse: Prepare and Explore Data for Palaeobiological Analyses

Documented in group_apply

#' Apply a function over grouping(s) of data
#'
#' A function to apply `palaeoverse` functionality across subsets (groups) of
#' data, delineated using one or more variables. Functions which receive a
#' `data.frame` as input (e.g. `nrow`, `ncol`, `lengths`, `unique`) may also be
#' used.
#'
#' @param occdf \code{dataframe}. A dataframe of fossil occurrences or taxa,
#' as relevant to the desired function.
#' This dataframe must contain the grouping variables and the necessary
#' variables for the function you wish to call (see function-specific
#' documentation for required columns).
#' @param group \code{character}. A vector of column names, specifying the
#' desired subgroups (e.g. "collection_no", "stage_bin"). Supplying more than
#' one grouping variable will produce an output containing subgroups for each
#' unique combination of values.
#' @param fun \code{function}. The function you wish to apply to
#' `occdf`. See details for compatible functions.
#' @param ... Additional arguments available in the called
#' function. These arguments may be required for function arguments
#' without default values, or if you wish to overwrite the default argument
#' value (see examples).
#'
#' @return A \code{data.frame} of the outputs from the selected function, with
#' appended column(s) indicating the user-defined groups. If a single vector
#' is returned via the called function, it will be transformed to a
#' \code{data.frame} with the column name equal to the input function.
#'
#' @details `group_apply` applies functions to subgroups of data within a
#' supplied dataset, enabling the separate analysis of occurrences or taxa from
#' different time intervals, spatial regions, or trait values. The function
#' serves as a wrapper around `palaeoverse` functions. Other functions which
#' can be applied to a `data.frame` (e.g. `nrow`, `ncol`, `lengths`,
#' `unique`) may also be used.
#' \cr
#' \cr
#' All `palaeoverse` functions which require a dataframe input can be used in
#' conjunction with the `group_apply` function. However, this is unnecessary
#' for many functions (e.g. \code{\link{bin_time}}) as groups do not need to
#' be partitioned before binning. This list provides
#' users with `palaeoverse` functions that might be interesting to apply across
#' group(s):
#' - \code{\link{tax_unique}}: return the number of unique taxa per grouping
#' variable.
#' - \code{\link{tax_range_time}}: return the temporal range of taxa per
#' grouping variable.
#' - \code{\link{tax_range_space}}: return the geographic range of taxa per
#' grouping variable.
#' - \code{\link{tax_check}}: return potential spelling variations of the
#' same taxon per grouping variable. Note: `verbose` needs to be set to FALSE.
#'
#'
#' @section Developer(s):
#' Lewis A. Jones & William Gearty
#' @section Reviewer(s):
#' Kilian Eichenseer & Bethany Allen
#' @importFrom stats as.formula
#' @examples
#' # Examples
#' # Get tetrapods data
#' occdf <- tetrapods[1:100, ]
#' # Remove NA data
#' occdf <- subset(occdf, !is.na(genus))
#' # Count number of occurrences from each country
#' ex1 <- group_apply(occdf = occdf, group = "cc", fun = nrow)
#' # Unique genera per collection with group_apply and input arguments
#' ex2 <- group_apply(occdf = occdf,
#'                      group = c("collection_no"),
#'                      fun = tax_unique,
#'                      genus = "genus",
#'                      family = "family",
#'                      order = "order",
#'                      class = "class",
#'                      resolution = "genus")
#' # Use multiple variables (number of occurrences per collection and formation)
#' ex3 <- group_apply(occdf = occdf,
#'                    group = c("collection_no", "formation"),
#'                    fun = nrow)
#' # Compute counts of occurrences per latitudinal bin
#' # Set up lat bins
#' bins <- lat_bins_degrees()
#' # bin occurrences
#' occdf <- bin_lat(occdf = occdf, bins = bins)
#' # Calculate number of occurrences per bin
#' ex4 <- group_apply(occdf = occdf, group = "lat_bin", fun = nrow)
#' @export
group_apply <- function(occdf, group, fun, ...) {

  # Handle errors
  if (!is.data.frame(occdf)) {
    stop("`occdf` should be a dataframe")
  }

  if (!any(group %in% colnames(occdf))) {
    stop("Supplied `group` is not a named column in `occdf`")
  }

  if (!is.function(fun)) {
    stop("Supplied `fun` is not a function")
  }

  supp_args <- list(...)
  if (!("..." %in% names(formals(fun)))) {
    indx <- which(!(names(supp_args) %in% names(formals(fun))))
    if (length(indx) > 1) {
      stop(paste(paste0("`", names(supp_args)[indx], "`", collapse = "/"),
                 "are not valid arguments for the specified function"))
    } else if (length(indx) == 1) {
      stop(paste0("`", names(supp_args)[indx], "`",
                  " is not a valid argument for the specified function"))
    }
  }
  # Generate bin codes
  bin_codes <- as.formula(paste0("~ ", paste(group, collapse = " + ")))
  # Split dataframe
  lst <- split(occdf, f = bin_codes, drop = TRUE, sep = "&1*^$0%")
  # Group names
  nme <- names(lst)
  # Split into individual columns
  nme_df <- do.call(rbind.data.frame, strsplit(nme, "&1*^$0%", fixed = TRUE))
  colnames(nme_df) <- group
  # Apply function
  output_lst <- lapply(lst, fun, ...)
  # Function name
  fun_name <- deparse(substitute(fun))
  # Add groupings to output
  output_df <- do.call(rbind.data.frame,
                       lapply(seq_along(output_lst), FUN = function(i) {
    df <- output_lst[[i]]
    if (is.null(df)) return(df)
    if (!is.data.frame(df)) {
      df <- as.data.frame(df)
      colnames(df) <- fun_name
    }
    if (!is.null(nrow(df)) && nrow(df) > 0) {
      nms <- c(colnames(df), colnames(nme_df))
      df <- cbind.data.frame(df, nme_df[i, , drop = TRUE])
      colnames(df) <- nms
      df
    }
  }))
  # Update output if none returned
  if (nrow(output_df) == 0) {
    output_df <- NULL
  }
  # Return output
  return(output_df)
}