R/lagTimeSeries.R

Defines functions prepareLaggedData lagTimeSeries

Documented in lagTimeSeries prepareLaggedData

#' Create lagged versions of time series variables
#'
#' @description Takes a multivariate time series and creates time-lagged
#' columns for modeling. This generates one new column per lag and variable,
#' enabling analysis of how past values influence current observations.
#'
#'
#' @param input.data a dataframe with one time series per column. Default: \code{NULL}.
#' @param response character string, name of the numeric column to be used as response in the model. Default: \code{NULL}.
#' @param drivers  character vector, names of the numeric columns to be used as predictors in the model. Default: \code{NULL}.
#' @param time character vector, name of the numeric column with the time. Default: \code{NULL}.
#' @param oldest.sample character string, either "first" or "last". When "first", the first row taken as the oldest case of the time series and the last row is taken as the newest case, so ecological memory flows from the first to the last row of \code{input.data}. When "last", the last row is taken as the oldest sample, and this is the mode that should be used when \code{input.data} represents a palaeoecological dataset. Default: \code{"first"}.
#' @param lags numeric vector, lags to be used in the equation, in the same units as \code{time}. The use of \code{\link{seq}} to define it is highly recommended. If 0 is absent from lags, it is added automatically to allow the consideration of a concurrent effect. Lags should be aligned to the temporal resolution of the data. For example, if the interval between consecutive samples is 100 years, lags should be something like \code{0, 100, 200, 300}. Lags can also be multiples of the time resolution, such as \code{0, 200, 400, 600} (when time resolution is 100 years). Default: \code{NULL}.
#' @param time.zoom numeric vector of two values from the range of the \code{time} column, used to subset the data if desired. Default: \code{NULL}.
#' @param scale boolean, if TRUE, applies the \code{\link{scale}} function to normalize the data. Required if the lagged data is going to be used to fit linear models. Default: \code{FALSE}.
#'
#' @details The function interprets the \code{time} column as an index representing the temporal position of each sample. It uses the \code{lag} function from the \pkg{zoo} package to shift columns by the specified lags, generating one new column per lag and variable.
#'
#' @author Blas M. Benito  <blasbenito@gmail.com>
#'
#' @return A dataframe with columns representing time-delayed values of the drivers and the response. Column names have the lag number as a suffix. Has the attributes `response` and `drivers`, later used by [computeMemory()].
#'
#' @seealso \code{\link{computeMemory}}
#'
#' @examples
#'#loading data
#'data(palaeodata)
#'
#'#adding lags
#'lagged.data <- lagTimeSeries(
#'  input.data = palaeodata,
#'  response = "pollen.pinus",
#'  drivers = c("climate.temperatureAverage", "climate.rainfallAverage"),
#'  time = "age",
#'  oldest.sample = "last",
#'  lags = seq(0.2, 1, by=0.2)
#')
#'
#'str(lagged.data)
#'
#'# Check attributes (used by computeMemory)
#'attributes(lagged.data)
#' @family data_preparation
#' @export
lagTimeSeries <- function(
  input.data = NULL,
  response = NULL,
  drivers = NULL,
  time = NULL,
  oldest.sample = "first",
  lags = NULL,
  time.zoom = NULL,
  scale = FALSE
) {
  #testing input data
  if (!inherits(input.data, "data.frame")) {
    stop("Argument input.data must be a dataframe.")
  }
  if (!is.character(response)) {
    stop("Argument response must be a character string.")
  } else {
    if (!(response %in% colnames(input.data))) {
      stop("The response column does not exist in input.data.")
    }
  }

  if (!is.character(drivers)) {
    stop("Argument drivers must be a character string or character vector.")
  } else {
    for (driver in drivers) {
      if (!(driver %in% colnames(input.data))) {
        stop(paste(
          "The driver ",
          driver,
          " column does not exist in input.data.",
          sep = ""
        ))
      }
    }
  }

  if (!is.character(time)) {
    stop("Argument time must be a character string.")
  } else {
    if (!(time %in% colnames(input.data))) {
      stop("The time column do not exist in input.data.")
    }
  }

  if (
    !(oldest.sample %in% c("first", "FIRST", "First", "last", "LAST", "Last"))
  ) {
    oldest.sample <- "first"
    message(
      "Argument oldest.sample was not defined, I am setting it up to 'first'. Check the help file for more details."
    )
  }

  if (!is.null(time.zoom)) {
    if (max(time.zoom) > max(input.data[, time])) {
      stop(
        "Maximum of time.zoom should be lower or equal than the maximum of the time/age column."
      )
    }
    if (min(time.zoom) < min(input.data[, time])) {
      stop(
        "Minimum of time.zoom should be higher or equal than the minimum of the time/age column."
      )
    }
  }

  #testing if lags are regular
  diff.lags <- diff(lags)
  if (sd(diff.lags) < 0e-10) {
    stop("Argument 'lags' must be a regular sequence.")
  }

  diff.time <- diff(input.data[[time]])
  if (sd(diff.time) < 0e-10) {
    stop(
      "Column '",
      time,
      "' of argument 'input.data' is not a regular sequence."
    )
  }

  if (max(diff.lags) < min(diff.time)) {
    lags <- seq(
      from = min(lags),
      to = max(lags),
      by = min(diff.time)
    )

    message(
      "Lags interval is smaller than the time interval in 'input.data', resetting 'lags' to: ",
      paste(lags, collapse = ", "),
      "."
    )
  }

  #computing data resolution to adjust lags for the annual resolution dataset
  temporal.resolution <- mean(diff.time, na.rm = TRUE)

  #converting lags from years to cases to be used as lags
  lags.to.rows <- round(lags / temporal.resolution, 0)

  #testing lags.to.rows
  if (length(unique(lags.to.rows)) != length(lags.to.rows)) {
    stop(
      "Lags must be in the units of the column '",
      time,
      "' and it's values be multiples of the time resolution (i.e. if time resolution is 100, valid lags are 0, 100, 200, 300, etc)"
    )
  }

  #adds 0 to lags if it's not
  if (!(0 %in% lags)) {
    lags.to.rows <- c(0, lags.to.rows)
    lags <- c(0, lags)
  }

  #if the first sample is the oldest one, lags have to be negative
  if (
    oldest.sample == "first" ||
      oldest.sample == "First" ||
      oldest.sample == "FIRST"
  ) {
    lags.to.rows <- -lags.to.rows
  }

  #if the last sample is the oldest one, lags have to be positive
  if (
    oldest.sample == "last" ||
      oldest.sample == "Last" ||
      oldest.sample == "LAST"
  ) {
    lags.to.rows <- abs(lags.to.rows)
  }

  #apply time.zoom if so
  if (
    !is.null(time.zoom) &&
      is.vector(time.zoom) &&
      is.numeric(time.zoom) &&
      length(time.zoom) == 2
  ) {
    input.data <- input.data[
      input.data[, time] >= time.zoom[1] & input.data[, time] <= time.zoom[2],
    ]
  }

  #computing lags of the response
  response.lags <- do.call(
    "merge",
    lapply(lags.to.rows, function(lag.to.row) {
      lag(zoo::as.zoo(input.data[, response]), lag.to.row)
    })
  )

  #naming columns
  colnames(response.lags) <- paste(response, lags, sep = "__")

  #driver lags
  for (driver in drivers) {
    driver.lags <- do.call(
      "merge",
      lapply(lags.to.rows, function(lag.to.row) {
        lag(zoo::as.zoo(input.data[, driver]), lag.to.row)
      })
    )

    #naming columns
    colnames(driver.lags) <- paste(driver, lags, sep = "__")

    #joining with response lags
    response.lags <- cbind(response.lags, driver.lags)
  }

  #removing NA
  response.lags <- as.data.frame(response.lags)
  response.lags$time <- input.data[, time]
  response.lags <- na.omit(response.lags)
  time <- response.lags$time
  response.lags$time <- NULL

  #scaling data
  if (scale) {
    response.lags <- data.frame(scale(response.lags), time)
  } else {
    response.lags <- data.frame(response.lags, time)
  }

  attr(x = response.lags, which = "response") <- response
  attr(x = response.lags, which = "drivers") <- drivers

  return(response.lags)
}


#' @rdname lagTimeSeries
#' @export
prepareLaggedData <- function(
  input.data = NULL,
  response = NULL,
  drivers = NULL,
  time = NULL,
  oldest.sample = "first",
  lags = NULL,
  time.zoom = NULL,
  scale = FALSE
) {
  .Deprecated("lagTimeSeries")
  lagTimeSeries(
    input.data = input.data,
    response = response,
    drivers = drivers,
    time = time,
    oldest.sample = oldest.sample,
    lags = lags,
    time.zoom = time.zoom,
    scale = scale
  )
}

Try the memoria package in your browser

Any scripts or data that you put into this service are public.

memoria documentation built on Feb. 10, 2026, 9:07 a.m.