R/time_bins.R
In palaeoverse: Prepare and Explore Data for Palaeobiological Analyses

Documented in time_bins

#' Generate time bins
#'
#' A function to generate time bins for a given study interval and geological
#' timescale. This function is flexible in that either stage-level or higher
#' stratigraphic-level (e.g. period) time bins can be called, valid timescales
#' from [Macrostrat](https://macrostrat.org/api/defs/timescales?all) can be
#' used, or a `data.frame` of a geological timescale can be provided. In
#' addition, near equal-length time bins can be generated by grouping
#' intervals together. For example, for a target bin size of 10 Myr, the
#' function will generate groups of bins that have a mean bin length close to
#' 10 Myr. However, users may also want to consider grouping stages based on
#' other reasoning e.g. availability of outcrop (see Dean et al. 2020).
#'
#' @param interval \code{character or numeric}. Interval name available in
#'   \code{\link{GTS2020}} or \code{\link{GTS2012}}. If a single interval name
#'   is provided, this interval is returned. If two interval names are
#'   provided, these intervals and those existing between are returned. If a
#'   single numeric age is provided, the interval that covers this age is
#'   returned. If two numeric ages are provided, the intervals occurring in
#'   the range of these ages are returned. Defaults to "Phanerozoic". This
#'   argument is ignored if `scale` is not "GTS2020" or "GTS2012".
#' @param rank \code{character}. Which stratigraphic rank is desired? Choose
#'   from: "stage", "epoch", "period", "era", and "eon". This argument is
#'   ignored if `scale` is not "GTS2020" or "GTS2012".
#' @param size \code{numeric}. If equal-length time bins are desired, specify
#'   the length in millions of years (Myr) of the time bins desired.
#' @param assign \code{numeric}. A numeric vector of age estimates to use to
#'   assign to requested bins. If assign is specified, a numeric vector is
#'   returned of the midpoint age of the specified bins. Note this is the
#'   simplified approach of assignment in `palaeoverse` included for data with
#'   'known' point-age estimates. For a wider range of binning methods, see
#'   \code{\link[palaeoverse:bin_time]{palaeoverse::bin_time()}}.
#' @param scale \code{character} or \code{data.frame}. Specify the desired
#'   geological timescale to be used: "GTS2020" (default), "GTS2012", a valid
#'   timescale from
#'   [Macrostrat](https://macrostrat.org/api/defs/timescales?all), or
#'   user-input `data.frame`. If a `data.frame` is provided, it must contain
#'   at least the following named columns: "interval_name", "max_ma", and
#'   "min_ma". As such, age data should be provided in Ma.
#' @param plot \code{logical}. Should a plot of time bins be generated?
#' @importFrom graphics polygon title
#' @importFrom stats sd
#' @importFrom curl nslookup
#' @importFrom grDevices col2rgb
#' @importFrom utils read.csv
#'
#' @return A \code{data.frame} of time bins for the specified intervals or a
#'   list with a \code{data.frame} of time bins and a named \code{numeric}
#'   vector (bin number) of binned age estimates (midpoint of specified bins)
#'   if `assign` is specified. By default, the time bins \code{data.frame}
#'   contains the following columns: bin, interval_name, rank, max_ma, mid_ma,
#'   min_ma, duration_myr, abbr (interval abbreviation), colour and font
#'   (colour). If `size` is specified, the time bins \code{data.frame}
#'   contains the following columns: bin, max_ma, mid_ma, min_ma,
#'   duration_myr, grouping_rank, intervals, colour and font.
#'
#' @details This function uses either the Geological Time Scale 2020,
#'   Geological Time Scale 2012, a valid timescale from
#'   [Macrostrat](https://macrostrat.org/api/defs/timescales?all), or a
#'   user-input `data.frame` (see `scale` argument) to generate time bins.
#'   Additional information on included Geological Time Scales and source can
#'   be accessed via:
#'   - \code{\link{GTS2020}}
#'   - \code{\link{GTS2012}}
#'
#'   Available interval names are accessible via the `interval_name` column
#'   in `GTS2012` and `GTS2020`. Data of the Geological Timescale 2020 and
#'   2012 were compiled by Lewis A. Jones (2022-07-02).
#'
#' @section References:
#' Dean, C.D., Chiarenza, A.A. and Maidment, S.C., 2020. Formation binning: a
#' new method for increased temporal resolution in regional studies, applied
#' to the Late Cretaceous dinosaur fossil record of North America.
#' Palaeontology, 63(6), 881-901. \doi{10.1111/pala.12492}.
#'
#' @section Developer(s):
#' Lewis A. Jones
#' @section Reviewer(s):
#' Kilian Eichenseer & William Gearty
#' @export
#' @examples
#' #Using numeric age
#' ex1 <- time_bins(interval = 10, plot = TRUE)
#'
#' #Using numeric age range
#' ex2 <- time_bins(interval = c(50, 100), plot = TRUE)
#'
#' #Using a single interval name
#' ex3 <- time_bins(interval = c("Maastrichtian"), plot = TRUE)
#'
#' #Using a range of intervals and near-equal duration bins
#' ex4 <- time_bins(interval = c("Fortunian", "Meghalayan"),
#'                  size = 10, plot = TRUE)
#'
#' #Assign bins based on given age estimates
#' ex5 <- time_bins(interval = c("Fortunian", "Meghalayan"),
#'                  assign = c(232, 167, 33))
#'
#' #Use user-input data.frame to generate near-equal length bins
#' scale <- data.frame(interval_name = 1:5,
#'                     min_ma = c(0, 18, 32, 38, 45),
#'                     max_ma = c(18, 32, 38, 45, 53))
#' ex6 <- time_bins(scale = scale, size = 20, plot = TRUE)
#'
#' #Use North American land mammal ages from Macrostrat
#' ex7 <- time_bins(scale = "North American land mammal ages", size = 10)
#'
time_bins <- function(interval = "Phanerozoic", rank = "stage", size = NULL,
                      assign = NULL, scale = "GTS2020", plot = FALSE) {
  # Error handling -------------------------------------------------------
  if (!is.character(interval) &&
      !is.numeric(interval) &&
      !is.null(interval)) {
    stop("`interval` must be NULL or of class 'character' or 'numeric'")
  }

  if (!is.numeric(size) && !is.null(size)) {
    stop("`size` should be a 'numeric' or NULL.")
  }

  if (!is.logical(plot)) {
    stop("`plot` should be logical (TRUE/FALSE).")
  }

  if (is.numeric(assign) && any(assign < 0)) {
    stop(paste("Age estimates for `assign` should be non-negative values.",
               "Hint: You can transform your data using abs()."))
  }

  if (!is.character(scale) && !is.data.frame(scale)) {
    stop(paste("`scale` must be either:\n",
               "The name of an in-built time scale (e.g. 'GTS2020'),",
               "the name of a Macrostrat time scale (see details),",
               "or a `data.frame`."))
  }

  if (is.data.frame(scale) &&
      any(!c("interval_name", "max_ma", "min_ma") %in% colnames(scale))) {
    stop(paste("`scale` does not contain named columns:",
    "'interval_name', 'max_ma', and 'min_ma'."))
  }

  if (length(interval) > 2 && !is.null(interval)) {
    stop(paste("`interval` must be a 'character' or",
               "'numeric' vector of length 1 or 2",
               "or NULL."))
  }

  if (length(rank) > 1) {
    stop("`rank` must be of length 1.")
  }

  if (!rank %in% c("stage", "epoch", "period", "era", "eon", NULL)) {
    stop("`rank` must be either: stage, epoch, period, era, eon, or NULL.")
  }

  # Input dataframe scale ------------------------------------------------
  if (is.data.frame(scale)) {
    # Assign input scale to df
    df <- scale
    # Update scale/rank
    scale <- "user"
    rank <- "user"
    # Add mid_ma
    df$mid_ma <- (df$max_ma + df$min_ma) / 2
    # Add duration
    df$duration_myr <- (df$max_ma - df$min_ma)
    # Add bin number
    df$bin <- seq_len(nrow(df))
    # Add scale as rank
    df$rank <- scale
    # Add colour column if it doesn't already exist
    if (!"colour" %in% colnames(df)) {
      df$colour <- "#80cdc1"
    }
    # Add abbr column if it doesn't already exist
    if (!"abbr" %in% colnames(df)) {
      df$abbr <- NA
    }
  }
  # In-built scales ------------------------------------------------------
  if (scale %in% c("GTS2020", "GTS2012")) {
    if (is.null(interval)) {
      stop("`interval` is NULL. You must define an interval/age range.")
    }
    # Which geological timescale to use?
    if (scale == "GTS2020") {
      df <- palaeoverse::GTS2020
    } else if (scale == "GTS2012") {
      df <- palaeoverse::GTS2012
    }
    # Update interval number to bin number
    colnames(df)[which(colnames(df) == "interval_number")] <- "bin"
    # Filter dataset by desired interval names (character string)
    if (is.character(interval)) {
      # Check interval names
      int_index <- charmatch(interval, df$interval_name)
      if (any(is.na(int_index))) {
        stop(paste("Check spelling of specified intervals.",
        "Available intervals are accessible via GTS2020 and GTS2012."))
      }
      # Subset df for intervals
      if (length(int_index) > 1) {
        int_min <- min(c(df$min_ma[int_index[1]], df$min_ma[int_index[2]]))
        int_max <- max(c(df$max_ma[int_index[1]], df$max_ma[int_index[2]]))
        int_index <- c(int_min, int_max)
      } else {
        int_index <- c(df$min_ma[int_index], df$max_ma[int_index])
      }
      # Subset df
      int_index <- which(df$max_ma > min(int_index) &
                         df$min_ma < max(int_index))
    }
    # Filter dataset by desired interval ages (numeric string)
    if (is.numeric(interval)) {
      # Check age range
      if (max(interval) > max(df$max_ma)) {
        stop("maximum `interval` value is greater than available intervals")
      }
      if (min(interval) < min(df$min_ma)) {
        stop("minimum `interval` value is less than available intervals")
      }
      # Subset df
      int_index <- which(df$max_ma >= min(interval) &
                            df$min_ma <= max(interval))
    }
    # Sort int_index
    int_index <- sort(int_index)
    # Subset df for intervals
    df <- df[int_index, ]
    # Which rank should be used?
    df <- df[which(df$rank == rank), ]
    # Error handle
    if (nrow(df) == 0) {
      stop("No intervals are available for the defined interval range.")
    }
  }
  # Macrostrat dataframe scale -------------------------------------------
  if (!scale %in% c("GTS2020", "GTS2012", "user")) {
    # Assign scale to rank
    rank <- scale
    # Try to get the time scale from Macrostrat
    # Online and Macrostrat available?
    tryCatch(
      {
        nslookup("macrostrat.org")
      },
      error = function(e) {
        stop("Macrostrat is not available. Either the site is down or you are
             not connected to the internet.",
             call. = FALSE
        )
      }
    )
    url <- url(paste0("https://macrostrat.org/api/v2/defs/intervals",
                      "?format=csv&timescale=",
                      gsub(" ", "%20", scale)))
    df <- tryCatch(
      {
        read.csv(url, header = TRUE, stringsAsFactors = FALSE)
      },
      error = function(e) {
        stop("`name` does not match a built-in or Macrostrat time scale.",
             call. = FALSE)
      })
    df <- df[, c("name", "b_age", "t_age", "abbrev", "color")]
    colnames(df) <- c("interval_name", "max_ma", "min_ma", "abbr", "colour")
    # Add mid_ma
    df$mid_ma <- (df$max_ma + df$min_ma) / 2
    # Add duration
    df$duration_myr <- (df$max_ma - df$min_ma)
    # Add bin number
    df$bin <- seq_len(nrow(df))
    # Add scale as rank
    df$rank <- scale
  }
  # Add font colour if it doesn't exist
  if (!("font" %in% colnames(df))) {
    # Font colours based on luminance as per
    # https://stackoverflow.com/a/1855903/4660582
    rgbs <- col2rgb(df$colour)
    luminance <- apply(rgbs, 2, function(x) {
      (0.299 * x[1] + 0.587 * x[2] + 0.114 * x[3]) / 255
    })
    df$font <- ifelse(luminance > .5, "black", "white")
  }
  # Tidy dataframe -------------------------------------------------------
  # Abbreviate names if required
  if (any(is.na(df$abbr))) {
    df$abbr <- abbreviate(df$interval_name, minlength = 1,
                          use.classes = FALSE, named = FALSE)
  }
  # Reorder dataframe
  df <- df[, c("bin", "interval_name", "rank", "max_ma", "mid_ma",
               "min_ma", "duration_myr", "abbr", "colour", "font")]
  # Generate equal-length bins? ------------------------------------------
  if (is.numeric(size)) {
      # Update bin size for age range
      # How many bins should be generated?
      n_bins <- round((max(df$max_ma) - min(df$min_ma)) / size)
      size <- (max(df$max_ma) - min(df$min_ma)) / n_bins
      #track cumulative sum
      tracker <- list()
      for (i in seq_len(length.out = nrow(df))) {
        tracker[[i]] <- rep(NA, length.out = nrow(df))
        tracker[[i]][i:nrow(df)] <- abs(
          cumsum(df$duration_myr[i:nrow(df)]) - size)
      }
      # Calculate upper and lower limits for each bin
      lower <- NULL
      upper <- NULL
      count <- 1
      while (count <= nrow(df)) {
        upper <- append(upper, which.min(tracker[[count]]))
        lower <- append(lower, (count))
        count <- which.min(tracker[[count]]) + 1
      }
      # Generate bin information
      bin <- rev(seq_along(upper))
      max_ma <- df[upper, c("max_ma")]
      min_ma <- df[lower, c("min_ma")]
      mid_ma <- (max_ma + min_ma) / 2
      duration_myr <- max_ma - min_ma
      intervals <- vector("character")
      # Resolve any edge effect
      if (duration_myr[length(duration_myr)] < size / 2) {
        upper[length(upper) - 1] <- upper[length(upper)]
        upper <- upper[-length(upper)]
        lower <- lower[-length(lower)]
        bin <- rev(seq_along(upper))
        max_ma <- df[upper, c("max_ma")]
        min_ma <- df[lower, c("min_ma")]
        mid_ma <- (max_ma + min_ma) / 2
        duration_myr <- max_ma - min_ma
      }
      # Get interval names
      for (i in seq_along(upper)) {
        intervals[i] <- toString(df[lower[i]:upper[i], c("interval_name")])
      }
      # Generate dataframe
      grouping_rank <- rank
      df <- cbind.data.frame(bin,
                             max_ma,
                             mid_ma,
                             min_ma,
                             duration_myr,
                             grouping_rank,
                             intervals)
      # Message user
      message(
        paste("Target equal length time bins was set to",
              round(size, digits = 2),
              "Myr. \nGenerated time bins have a mean length of",
              round(mean(df$duration_myr), digits = 2),
              "Myr and a standard deviation of",
              round(sd(df$duration_myr), digits = 2),
              "Myr."
        )
      )
  }
  # Plot data? -----------------------------------------------------------
  if (plot) {
    if (is.numeric(size)) {
      df$colour <- c("#80cdc1")
      df$font <- c("black")
    }
    plot(1, type = "n",
         xlim = c(max(df$max_ma), min(df$min_ma)),
         ylim = c(0, max(df$duration_myr)),
         xlab = "Time (Ma)",
         ylab = "Duration (Myr)"
    )
    for (i in seq_len(length.out = nrow(df))) {
      polygon(
        x = c(df$min_ma[i], df$max_ma[i], df$max_ma[i], df$min_ma[i]),
        y = c(0, 0, df$duration_myr[i], df$duration_myr[i]),
        col = df$colour[i]
      )
    }
    if (is.numeric(size)) {
      title(paste(
        "Mean bin length =",
        round(mean(df$duration_myr), digits = 2),
        "(standard deviation =",
        round(sd(df$duration_myr), digits = 2),
        ")"
      ))
    }
  }
  # Assign data? ---------------------------------------------------------
  if (!is.null(assign)) {
    if (is.numeric(assign)) {
      if (any(assign > max(df$max_ma) | assign < min(df$min_ma))) {
        stop("One or more ages is outside the specified time interval range")
      }
      tmp <- assign
      for (i in seq_len(length.out = nrow(df))) {
        assign[which(tmp <= df$max_ma[i] &
                       tmp >= df$min_ma[i])] <- df$mid_ma[i]
        names(assign)[which(tmp <= df$max_ma[i] &
                              tmp >= df$min_ma[i])] <- df$bin[i]
      }
      assign <- list(df, assign)
      names(assign) <- c("Bins", "Assignation")
      return(assign)
    } else {
      stop("`assign` should be a numeric")
    }
  }
  # Clean up --------------------------------------------------------------
  # Ensure consistency
  df <- df[order(df$mid_ma, decreasing = TRUE), ]
  df$bin <- seq_len(nrow(df))
  row.names(df) <- NULL
  return(df)
}