Nothing
#' Create lagged versions of time series variables
#'
#' @description Takes a multivariate time series and creates time-lagged
#' columns for modeling. This generates one new column per lag and variable,
#' enabling analysis of how past values influence current observations.
#'
#'
#' @param input.data a dataframe with one time series per column. Default: \code{NULL}.
#' @param response character string, name of the numeric column to be used as response in the model. Default: \code{NULL}.
#' @param drivers character vector, names of the numeric columns to be used as predictors in the model. Default: \code{NULL}.
#' @param time character vector, name of the numeric column with the time. Default: \code{NULL}.
#' @param oldest.sample character string, either "first" or "last". When "first", the first row taken as the oldest case of the time series and the last row is taken as the newest case, so ecological memory flows from the first to the last row of \code{input.data}. When "last", the last row is taken as the oldest sample, and this is the mode that should be used when \code{input.data} represents a palaeoecological dataset. Default: \code{"first"}.
#' @param lags numeric vector, lags to be used in the equation, in the same units as \code{time}. The use of \code{\link{seq}} to define it is highly recommended. If 0 is absent from lags, it is added automatically to allow the consideration of a concurrent effect. Lags should be aligned to the temporal resolution of the data. For example, if the interval between consecutive samples is 100 years, lags should be something like \code{0, 100, 200, 300}. Lags can also be multiples of the time resolution, such as \code{0, 200, 400, 600} (when time resolution is 100 years). Default: \code{NULL}.
#' @param time.zoom numeric vector of two values from the range of the \code{time} column, used to subset the data if desired. Default: \code{NULL}.
#' @param scale boolean, if TRUE, applies the \code{\link{scale}} function to normalize the data. Required if the lagged data is going to be used to fit linear models. Default: \code{FALSE}.
#'
#' @details The function interprets the \code{time} column as an index representing the temporal position of each sample. It uses the \code{lag} function from the \pkg{zoo} package to shift columns by the specified lags, generating one new column per lag and variable.
#'
#' @author Blas M. Benito <blasbenito@gmail.com>
#'
#' @return A dataframe with columns representing time-delayed values of the drivers and the response. Column names have the lag number as a suffix. Has the attributes `response` and `drivers`, later used by [computeMemory()].
#'
#' @seealso \code{\link{computeMemory}}
#'
#' @examples
#'#loading data
#'data(palaeodata)
#'
#'#adding lags
#'lagged.data <- lagTimeSeries(
#' input.data = palaeodata,
#' response = "pollen.pinus",
#' drivers = c("climate.temperatureAverage", "climate.rainfallAverage"),
#' time = "age",
#' oldest.sample = "last",
#' lags = seq(0.2, 1, by=0.2)
#')
#'
#'str(lagged.data)
#'
#'# Check attributes (used by computeMemory)
#'attributes(lagged.data)
#' @family data_preparation
#' @export
lagTimeSeries <- function(
input.data = NULL,
response = NULL,
drivers = NULL,
time = NULL,
oldest.sample = "first",
lags = NULL,
time.zoom = NULL,
scale = FALSE
) {
#testing input data
if (!inherits(input.data, "data.frame")) {
stop("Argument input.data must be a dataframe.")
}
if (!is.character(response)) {
stop("Argument response must be a character string.")
} else {
if (!(response %in% colnames(input.data))) {
stop("The response column does not exist in input.data.")
}
}
if (!is.character(drivers)) {
stop("Argument drivers must be a character string or character vector.")
} else {
for (driver in drivers) {
if (!(driver %in% colnames(input.data))) {
stop(paste(
"The driver ",
driver,
" column does not exist in input.data.",
sep = ""
))
}
}
}
if (!is.character(time)) {
stop("Argument time must be a character string.")
} else {
if (!(time %in% colnames(input.data))) {
stop("The time column do not exist in input.data.")
}
}
if (
!(oldest.sample %in% c("first", "FIRST", "First", "last", "LAST", "Last"))
) {
oldest.sample <- "first"
message(
"Argument oldest.sample was not defined, I am setting it up to 'first'. Check the help file for more details."
)
}
if (!is.null(time.zoom)) {
if (max(time.zoom) > max(input.data[, time])) {
stop(
"Maximum of time.zoom should be lower or equal than the maximum of the time/age column."
)
}
if (min(time.zoom) < min(input.data[, time])) {
stop(
"Minimum of time.zoom should be higher or equal than the minimum of the time/age column."
)
}
}
#testing if lags are regular
diff.lags <- diff(lags)
if (sd(diff.lags) < 0e-10) {
stop("Argument 'lags' must be a regular sequence.")
}
diff.time <- diff(input.data[[time]])
if (sd(diff.time) < 0e-10) {
stop(
"Column '",
time,
"' of argument 'input.data' is not a regular sequence."
)
}
if (max(diff.lags) < min(diff.time)) {
lags <- seq(
from = min(lags),
to = max(lags),
by = min(diff.time)
)
message(
"Lags interval is smaller than the time interval in 'input.data', resetting 'lags' to: ",
paste(lags, collapse = ", "),
"."
)
}
#computing data resolution to adjust lags for the annual resolution dataset
temporal.resolution <- mean(diff.time, na.rm = TRUE)
#converting lags from years to cases to be used as lags
lags.to.rows <- round(lags / temporal.resolution, 0)
#testing lags.to.rows
if (length(unique(lags.to.rows)) != length(lags.to.rows)) {
stop(
"Lags must be in the units of the column '",
time,
"' and it's values be multiples of the time resolution (i.e. if time resolution is 100, valid lags are 0, 100, 200, 300, etc)"
)
}
#adds 0 to lags if it's not
if (!(0 %in% lags)) {
lags.to.rows <- c(0, lags.to.rows)
lags <- c(0, lags)
}
#if the first sample is the oldest one, lags have to be negative
if (
oldest.sample == "first" ||
oldest.sample == "First" ||
oldest.sample == "FIRST"
) {
lags.to.rows <- -lags.to.rows
}
#if the last sample is the oldest one, lags have to be positive
if (
oldest.sample == "last" ||
oldest.sample == "Last" ||
oldest.sample == "LAST"
) {
lags.to.rows <- abs(lags.to.rows)
}
#apply time.zoom if so
if (
!is.null(time.zoom) &&
is.vector(time.zoom) &&
is.numeric(time.zoom) &&
length(time.zoom) == 2
) {
input.data <- input.data[
input.data[, time] >= time.zoom[1] & input.data[, time] <= time.zoom[2],
]
}
#computing lags of the response
response.lags <- do.call(
"merge",
lapply(lags.to.rows, function(lag.to.row) {
lag(zoo::as.zoo(input.data[, response]), lag.to.row)
})
)
#naming columns
colnames(response.lags) <- paste(response, lags, sep = "__")
#driver lags
for (driver in drivers) {
driver.lags <- do.call(
"merge",
lapply(lags.to.rows, function(lag.to.row) {
lag(zoo::as.zoo(input.data[, driver]), lag.to.row)
})
)
#naming columns
colnames(driver.lags) <- paste(driver, lags, sep = "__")
#joining with response lags
response.lags <- cbind(response.lags, driver.lags)
}
#removing NA
response.lags <- as.data.frame(response.lags)
response.lags$time <- input.data[, time]
response.lags <- na.omit(response.lags)
time <- response.lags$time
response.lags$time <- NULL
#scaling data
if (scale) {
response.lags <- data.frame(scale(response.lags), time)
} else {
response.lags <- data.frame(response.lags, time)
}
attr(x = response.lags, which = "response") <- response
attr(x = response.lags, which = "drivers") <- drivers
return(response.lags)
}
#' @rdname lagTimeSeries
#' @export
prepareLaggedData <- function(
input.data = NULL,
response = NULL,
drivers = NULL,
time = NULL,
oldest.sample = "first",
lags = NULL,
time.zoom = NULL,
scale = FALSE
) {
.Deprecated("lagTimeSeries")
lagTimeSeries(
input.data = input.data,
response = response,
drivers = drivers,
time = time,
oldest.sample = oldest.sample,
lags = lags,
time.zoom = time.zoom,
scale = scale
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.