Nothing
#' Add Lagged Columns via Join
#'
#' Appends lagged versions of specified columns to a data frame using a
#' join-based approach.
#'
#' When `lag == max_lag` (the default), an equi-join is used: source dates
#' are shifted forward by `lag` and matched exactly. When `lag < max_lag`,
#' an inequality join is used: for each row, the most recent source value
#' within the window `[date - max_lag, date - lag]` is selected.
#'
#' The combination of `by` and date columns must be unique in `data`. If `by`
#' is `NULL`, dates alone must be unique.
#'
#' @param data A data frame containing the variables to lag.
#' @param cols A character vector specifying the names of the columns to be
#' lagged. Each column produces a new column suffixed with `_lag`.
#' @param lag An integer or a `lubridate::periods()` object, e.g.,
#' `months(1)`, specifying the minimum lag (inclusive) to apply.
#' @param max_lag An integer or a `lubridate::periods()` object specifying
#' the maximum lag (inclusive) to apply. Defaults to `lag` (exact lag).
#' @param by An optional character vector specifying grouping columns
#' (e.g., a stock identifier). Lagged values are matched within groups.
#' Defaults to `NULL`.
#' @param drop_na A logical value. If `TRUE`, `NA` values in the source
#' columns are excluded before matching, so the lookup skips over missing
#' observations. Applied independently per column. Defaults to `FALSE`.
#' @param ff_adjustment A logical value. If `TRUE`, only the last observation
#' per year (within each group defined by `by`) is retained as a source for
#' lagged values, following Fama-French conventions for annual accounting
#' data. Defaults to `FALSE`.
#' @param data_options A list of class `tidyfinance_data_options` (created via
#' [data_options()]) specifying column name mappings. The `date` element is
#' used to specify the date column. Uses [data_options()] default if `NULL`:
#' `"date" = "date"`.
#'
#' @returns A data frame with the same rows as `data` and new columns appended,
#' each suffixed with `_lag`. Unmatched rows receive `NA` in the lagged
#' columns.
#'
#' @family rolling and lagging functions
#' @export
#'
#' @examples
#' set.seed(42)
#' data <- tibble::tibble(
#' permno = rep(1:2, each = 10),
#' date = rep(
#' seq.Date(as.Date("2023-01-01"), by = "month", length.out = 10),
#' 2
#' ),
#' size = runif(20, 100, 200),
#' bm = runif(20, 0.5, 1.5)
#' )
#'
#' # Exact lag: each row gets the value from exactly 2 months earlier
#' add_lagged_columns(
#' data,
#' cols = c("size", "bm"),
#' lag = months(2),
#' by = "permno"
#' )
#'
#' # Window lag: each row gets the most recent value from 2 to 4 months earlier
#' add_lagged_columns(
#' data,
#' cols = "size",
#' lag = months(2),
#' max_lag = months(4),
#' by = "permno"
#' )
#'
add_lagged_columns <- function(
data,
cols,
lag,
max_lag = lag,
by = NULL,
drop_na = FALSE,
ff_adjustment = FALSE,
data_options = NULL
) {
if (is.null(data_options)) {
data_options <- data_options()
}
date_col <- data_options$date
if (!date_col %in% names(data)) {
cli::cli_abort(
"{.arg data} must contain the date column {.field {date_col}}."
)
}
if (lag < 0 || max_lag < lag) {
cli::cli_abort(
paste(
"{.arg lag} and {.arg max_lag} must be non-negative",
"and {.arg max_lag} must be >= {.arg lag}."
)
)
}
missing_cols <- setdiff(cols, names(data))
if (length(missing_cols) > 0) {
cli::cli_abort(
"{.arg data} is missing column{?s}: {.field {missing_cols}}."
)
}
if (!is.null(by)) {
missing_by <- setdiff(by, names(data))
if (length(missing_by) > 0) {
cli::cli_abort(
"{.arg data} is missing grouping column{?s}: {.field {missing_by}}."
)
}
}
join_cols <- c(by, date_col)
if (anyDuplicated(data[join_cols])) {
cli::cli_abort(
paste(
"The combination of {.arg by}",
"and date columns must be unique in {.arg data}."
)
)
}
exact_lag <- (lag == max_lag)
if (!exact_lag) {
data <- data |>
check_new_col(c(".upper", ".lower")) |>
dplyr::mutate(
.upper = .data[[date_col]] - lag,
.lower = .data[[date_col]] - max_lag
)
}
for (col in cols) {
lagged <- data[c(join_cols, col)]
if (drop_na) {
lagged <- lagged[!is.na(lagged[[col]]), ]
}
if (ff_adjustment) {
yr <- lubridate::year(lagged[[date_col]])
grp <- if (!is.null(by)) interaction(lagged[by], yr) else yr
max_dates <- stats::ave(as.numeric(lagged[[date_col]]), grp, FUN = max)
lagged <- lagged[as.numeric(lagged[[date_col]]) == max_dates, ]
}
lag_col_name <- paste0(col, "_lag")
if (exact_lag) {
lagged[[date_col]] <- lagged[[date_col]] + lag
names(lagged)[names(lagged) == col] <- lag_col_name
data <- data |>
check_new_col(lag_col_name) |>
dplyr::left_join(lagged, by = join_cols)
} else {
names(lagged)[names(lagged) == date_col] <- ".src_date"
names(lagged)[names(lagged) == col] <- lag_col_name
data <- data |>
check_new_col(lag_col_name) |>
dplyr::left_join(
lagged,
by = dplyr::join_by(!!!rlang::syms(by), closest(.upper >= .src_date))
) |>
dplyr::mutate(
!!lag_col_name := dplyr::if_else(
!is.na(.data[[".src_date"]]) &
.data[[".src_date"]] >= .data[[".lower"]],
.data[[lag_col_name]],
.data[[lag_col_name]][NA_integer_]
)
) |>
dplyr::select(-".src_date")
}
}
if (!exact_lag) {
data[[".upper"]] <- NULL
data[[".lower"]] <- NULL
}
data
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.