R/make_split.R
In tscv: Functions and Utilities for Tidy Time Series Forecasting and Time Series Cross-Validation

Documented in make_split split_index

#' @title Initialize a plan for train-test split
#'
#' @description
#' Internal helper used by \code{make_split()} to determine the initial training
#' window size for each time series.
#'
#' @param main_frame A \code{tibble} containing the time series data.
#' @param context A named \code{list} with the identifiers for
#'   \code{series_id}, \code{value_id}, and \code{index_id}.
#' @param type The type for the initial split. Possible values are
#'   \code{"first"}, \code{"last"}, and \code{"prob"}.
#' @param value Numeric value specifying the split.
#'
#' @return
#' A \code{tibble} with one row per time series and columns for the total number
#' of observations and the initial training window size.
#'
#' @noRd

initialize_split <- function(main_frame,
                             context,
                             type = c("first", "last", "prob"),
                             value = NULL) {

  series_id <- context[["series_id"]]

  data <- main_frame |>
    select(all_of(series_id)) |>
    group_by(!!sym(series_id)) |>
    summarise(n_total = n(), .groups = "drop")

  # Case 1: Use the first value observations for training
  if (type == "first") {
    data <- data |>
      mutate(n_init = value)
  }

  # Case 2: Use the last value observations for testing
  if (type == "last") {
    data <- data |>
      mutate(n_init = .data$n_total - value - 1)
  }

  # Case 3: Use value pct. of observations for training
  if (type == "prob") {
    data <- data |>
      mutate(n_init = floor(value * .data$n_total))
  }

  return(data)
}



#' @title Create indices for train and test splits
#'
#' @description
#' Create train and test indices for time series cross-validation.
#'
#' @details
#' \code{split_index()} creates integer index vectors for rolling-origin
#' resampling. The function can create either fixed-window or expanding-window
#' splits:
#' \itemize{
#'   \item \code{mode = "slide"} creates a fixed training window that moves
#'   forward over time.
#'   \item \code{mode = "stretch"} creates an expanding training window that
#'   always starts at the first observation.
#' }
#'
#' The first training window contains \code{n_init} observations. Each test
#' window contains \code{n_ahead} observations. The argument \code{n_skip}
#' controls how many observations are skipped between consecutive split origins.
#' For example, with \code{n_ahead = 18} and \code{n_skip = 17}, consecutive
#' test windows are non-overlapping.
#'
#' If \code{n_lag > 0}, the test indices include lagged observations before the
#' forecast horizon. This is useful when lagged predictors are needed for
#' constructing features during testing.
#'
#' If \code{exceed = TRUE}, additional out-of-sample test indices are allowed to
#' exceed the original sample size.
#'
#' @param n_total Integer. The total number of observations in the time series.
#' @param n_init Integer. The number of observations in the initial training
#'   window.
#' @param n_ahead Integer. The forecast horizon, i.e. the number of observations
#'   in each test window.
#' @param n_skip Integer. The number of observations to skip between split
#'   origins. The default is \code{0}.
#' @param n_lag Integer. The number of lagged observations to include before the
#'   test window. The default is \code{0}.
#' @param mode Character value. Either \code{"slide"} for a fixed-window
#'   approach or \code{"stretch"} for an expanding-window approach.
#' @param exceed Logical value. If \code{TRUE}, test indices may exceed the
#'   original sample size.
#'
#' @return
#' A \code{list} with two elements:
#' \itemize{
#'   \item \code{train}: a list of integer vectors with training indices.
#'   \item \code{test}: a list of integer vectors with test indices.
#' }
#'
#' @family time series cross-validation
#' @export
#'
#' @examples
#' # Fixed-window splits
#' fixed_index <- split_index(
#'   n_total = 180,
#'   n_init = 120,
#'   n_ahead = 18,
#'   n_skip = 17,
#'   n_lag = 0,
#'   mode = "slide",
#'   exceed = FALSE
#' )
#'
#' fixed_index
#'
#' # Expanding-window splits
#' expanding_index <- split_index(
#'   n_total = 180,
#'   n_init = 120,
#'   n_ahead = 18,
#'   n_skip = 17,
#'   n_lag = 0,
#'   mode = "stretch",
#'   exceed = FALSE
#' )
#'
#' expanding_index

split_index <- function(n_total,
                        n_init,
                        n_ahead,
                        n_skip = 0,
                        n_lag = 0,
                        mode = "slide",
                        exceed = FALSE) {

  if (exceed) {
    n_total <- n_total + n_ahead
  }

  if (n_total < n_init + n_ahead) {
    stop(
      "There should be at least ",
      n_init + n_ahead,
      " observations in `data`",
      call. = FALSE
    )
  }

  if (!is.numeric(n_lag) | !(n_lag %% 1 == 0)) {
    stop("`n_lag` must be a whole number.", call. = FALSE)
  }

  if (n_lag > n_init) {
    stop(
      "`n_lag` must be less than or equal to the number of training observations.",
      call. = FALSE
    )
  }

  stops <- seq(n_init, (n_total - n_ahead), by = n_skip + 1)

  starts <- if (mode == "slide") {
    stops - n_init + 1
  } else {
    rep(1, length(stops))
  }

  # Prepare index vectors for training and testing as list
  train <- map2(.x = starts, .y = stops, ~seq(.x, .y))
  test <- map2(.x = stops + 1 - n_lag, .y = stops + n_ahead, ~seq(.x, .y))

  index <- list(
    train = train,
    test = test
  )

  return(index)
}



#' @title Expand the split frame
#'
#' @description
#' Internal helper used by \code{make_split()} to expand nested train and test
#' index lists into one row per time series and split.
#'
#' @param split_frame A \code{tibble} containing nested train and test indices.
#' @param context A named \code{list} with the identifiers for
#'   \code{series_id}, \code{value_id}, and \code{index_id}.
#'
#' @return
#' A \code{tibble} containing one row per time series and split.
#'
#' @noRd

expand_split <- function(split_frame,
                         context) {

  series_id <- context[["series_id"]]

  split_frame <- split_frame |>
    select(
      all_of(c(series_id, "n_splits", "train", "test"))
    ) |>
    group_by(!!sym(series_id)) |>
    mutate(split = list(seq_len(.data$n_splits))) |>
    ungroup() |>
    unnest(cols = all_of(c("split", "train", "test"))) |>
    select(
      all_of(c(series_id, "split", "train", "test"))
    )

  return(split_frame)
}



#' @title Create train-test splits for time series cross-validation
#'
#' @description
#' Create a split frame with train and test indices for one or more time series.
#'
#' @details
#' \code{make_split()} creates rolling-origin train-test splits for time series
#' cross-validation. The output is used by functions such as
#' \code{slice_train()} and \code{slice_test()} to extract the corresponding
#' training and testing samples from \code{main_frame}.
#'
#' The function supports two training-window modes:
#' \itemize{
#'   \item \code{mode = "slide"} creates a fixed-window approach. The training
#'   window has constant length and moves forward over time.
#'   \item \code{mode = "stretch"} creates an expanding-window approach. The
#'   training window starts at the first observation and grows over time.
#' }
#'
#' The initial training window is controlled by \code{type} and \code{value}:
#' \itemize{
#'   \item \code{type = "first"} uses the first \code{value} observations as the
#'   initial training window.
#'   \item \code{type = "last"} keeps the last \code{value} observations for
#'   testing and derives the initial training window from the remaining sample.
#'   \item \code{type = "prob"} uses \code{floor(value * n_total)} observations
#'   as the initial training window.
#' }
#'
#' The argument \code{n_skip} controls how far the rolling origin moves between
#' consecutive splits. For non-overlapping test windows, use
#' \code{n_skip = n_ahead - 1}.
#'
#' @param main_frame A \code{tibble} containing the time series data.
#' @param context A named \code{list} with the identifiers for
#'   \code{series_id}, \code{value_id}, and \code{index_id}.
#' @param type Character value. The type of initial split. Possible values are
#'   \code{"first"}, \code{"last"}, and \code{"prob"}.
#' @param value Numeric value specifying the initial split.
#' @param n_ahead Integer. The forecast horizon, i.e. the number of observations
#'   in each test window.
#' @param n_skip Integer. The number of observations to skip between split
#'   origins. The default is \code{0}.
#' @param n_lag Integer. The number of lagged observations to include before the
#'   test window. This is useful if lagged predictors are required when
#'   constructing test features. The default is \code{0}.
#' @param mode Character value. Either \code{"slide"} for a fixed-window
#'   approach or \code{"stretch"} for an expanding-window approach.
#' @param exceed Logical value. If \code{TRUE}, out-of-sample splits exceeding
#'   the original sample size are created.
#'
#' @return
#' A \code{tibble} containing the split plan. The output has one row per time
#' series and split, with list-columns \code{train} and \code{test} containing
#' integer row positions.
#'
#' @family time series cross-validation
#' @export
#'
#' @examples
#' library(dplyr)
#'
#' context <- list(
#'   series_id = "series",
#'   value_id = "value",
#'   index_id = "index"
#' )
#'
#' main_frame <- M4_monthly_data |>
#'   filter(series == "M23100")
#'
#' # Fixed-window split plan
#' fixed_split <- make_split(
#'   main_frame = main_frame,
#'   context = context,
#'   type = "first",
#'   value = 120,
#'   n_ahead = 18,
#'   n_skip = 17,
#'   n_lag = 0,
#'   mode = "slide",
#'   exceed = FALSE
#' )
#'
#' fixed_split
#'
#' # Expanding-window split plan
#' expanding_split <- make_split(
#'   main_frame = main_frame,
#'   context = context,
#'   type = "first",
#'   value = 120,
#'   n_ahead = 18,
#'   n_skip = 17,
#'   n_lag = 0,
#'   mode = "stretch",
#'   exceed = FALSE
#' )
#'
#' expanding_split

make_split <- function(main_frame,
                       context,
                       type,
                       value,
                       n_ahead,
                       n_skip = 0,
                       n_lag = 0,
                       mode = "slide",
                       exceed = TRUE) {

  # Create initial split
  split_frame <- initialize_split(
    main_frame = main_frame,
    context = context,
    type = type,
    value = value
  )

  # Create indices for train and test data and add as nested list
  split_frame <- map_dfr(
    .x = seq_len(nrow(split_frame)),
    .f = ~{
      # Create indices for training and testing
      index <- split_index(
        n_total = split_frame$n_total[.x],
        n_init = split_frame$n_init[.x],
        n_ahead = n_ahead,
        n_skip = n_skip,
        n_lag = n_lag,
        mode = mode,
        exceed = exceed
      )

      # Add indices to split_frame
      split_frame |>
        slice(.x) |>
        mutate(n_ahead = n_ahead) |>
        mutate(n_splits = length(index$train)) |>
        mutate(train = list(index$train)) |>
        mutate(test = list(index$test))
    }
  )

  # Expand split_frame and return output
  split_frame <- expand_split(
    split_frame = split_frame,
    context = context
  )

  return(split_frame)
}

Any scripts or data that you put into this service are public.

tscv documentation built on May 13, 2026, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tscv
Functions and Utilities for Tidy Time Series Forecasting and Time Series Cross-Validation

R/make_split.R
In tscv: Functions and Utilities for Tidy Time Series Forecasting and Time Series Cross-Validation

Defines functions make_split expand_split split_index initialize_split

Documented in make_split split_index

Try the tscv package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tscv Functions and Utilities for Tidy Time Series Forecasting and Time Series Cross-Validation

R/make_split.R In tscv: Functions and Utilities for Tidy Time Series Forecasting and Time Series Cross-Validation

Defines functions make_split expand_split split_index initialize_split

Documented in make_split split_index

Try the tscv package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tscv
Functions and Utilities for Tidy Time Series Forecasting and Time Series Cross-Validation

R/make_split.R
In tscv: Functions and Utilities for Tidy Time Series Forecasting and Time Series Cross-Validation