R/borg_cv.R

Defines functions convert_to_mlr3 convert_to_caret convert_to_rsample generate_temporal_sliding_folds generate_temporal_expanding_folds generate_temporal_group_folds generate_spatial_group_folds generate_spatial_temporal_folds generate_group_folds generate_temporal_block_folds generate_spatial_block_folds generate_random_folds print.borg_cv borg_cv

Documented in borg_cv

# BORG CV Generator
# Generates valid cross-validation schemes based on data dependency structure

#' Generate Valid Cross-Validation Scheme
#'
#' Creates cross-validation folds that respect data dependency structure.
#' When spatial, temporal, or clustered dependencies are detected, random
#' CV is disabled and appropriate blocking strategies are enforced.
#'
#' @param data A data frame to create CV folds for.
#' @param diagnosis A \code{\link{BorgDiagnosis}} object from \code{\link{borg_diagnose}}.
#'   If NULL, diagnosis is performed automatically.
#' @param v Integer. Number of folds. Default: 5.
#' @param coords Character vector of length 2 specifying coordinate column names.
#'   Required for spatial blocking if diagnosis is NULL.
#' @param time Character string specifying the time column name.
#'   Required for temporal blocking if diagnosis is NULL.
#' @param groups Character string specifying the grouping column name.
#'   Required for group CV if diagnosis is NULL.
#' @param target Character string specifying the response variable column name.
#' @param block_size Numeric. For spatial blocking, the minimum block size.
#'   If NULL, automatically determined from diagnosis. Should be larger than
#'   the autocorrelation range.
#' @param embargo Integer. For temporal blocking, minimum gap between train and test.
#'   If NULL, automatically determined from diagnosis.
#' @param strategy Character. Override the auto-detected CV strategy. Use
#'   \code{"temporal_expanding"} for expanding window (forward-chaining) or
#'   \code{"temporal_sliding"} for fixed-size sliding window.
#'   Default: NULL (auto-detect from diagnosis).
#' @param output Character. Output format: "list" (default), "rsample", "caret", "mlr3".
#' @param allow_random Logical. If TRUE, allows random CV even when dependencies detected.
#'   Default: FALSE. Setting to TRUE requires explicit acknowledgment.
#' @param verbose Logical. If TRUE, print diagnostic messages. Default: FALSE.
#'
#' @return Depending on \code{output}:
#' \describe{
#'   \item{"list"}{A list with elements: \code{folds} (list of train/test index vectors),
#'     \code{diagnosis} (the BorgDiagnosis used), \code{strategy} (CV strategy name),
#'     \code{params} (parameters used).}
#'   \item{"rsample"}{An \code{rsample} \code{rset} object compatible with tidymodels.}
#'   \item{"caret"}{A \code{trainControl} object for caret.}
#'   \item{"mlr3"}{An \code{mlr3} \code{Resampling} object.}
#' }
#'
#' @details
#' \subsection{The Enforcement Principle}{
#' Unlike traditional CV helpers, \code{borg_cv} enforces valid evaluation:
#' \itemize{
#'   \item If spatial autocorrelation is detected, \strong{random CV is disabled}
#'   \item If temporal autocorrelation is detected, \strong{random CV is disabled}
#'   \item If clustered structure is detected, \strong{random CV is disabled}
#'   \item To use random CV on dependent data, you must set \code{allow_random = TRUE}
#'         and provide justification (this is logged).
#' }
#' }
#'
#' \subsection{Spatial Blocking}{
#' When spatial dependencies are detected, data are partitioned into spatial blocks
#' using k-means clustering on coordinates. Block size is set to exceed the
#' estimated autocorrelation range. This ensures train and test sets are
#' spatially separated.
#' }
#'
#' \subsection{Temporal Blocking}{
#' When temporal dependencies are detected, data are split chronologically with
#' an embargo period between train and test sets. This prevents information from
#' future observations leaking into training.
#' }
#'
#' \subsection{Group CV}{
#' When clustered structure is detected, entire groups (clusters) are held out
#' together. No group appears in both train and test within a fold.
#' }
#'
#' @examples
#' # Spatial data with autocorrelation
#' set.seed(42)
#' spatial_data <- data.frame(
#'   x = runif(200, 0, 100),
#'   y = runif(200, 0, 100),
#'   response = rnorm(200)
#' )
#'
#' # Diagnose and create CV
#' cv <- borg_cv(spatial_data, coords = c("x", "y"), target = "response")
#' str(cv$folds)  # List of train/test indices
#'
#' # Clustered data
#' clustered_data <- data.frame(
#'   site = rep(1:20, each = 10),
#'   value = rep(rnorm(20, sd = 2), each = 10) + rnorm(200, sd = 0.5)
#' )
#'
#' cv <- borg_cv(clustered_data, groups = "site", target = "value")
#' cv$strategy  # "group_fold"
#'
#' # Get rsample-compatible output for tidymodels
#' \donttest{
#' cv_rsample <- borg_cv(spatial_data, coords = c("x", "y"), output = "rsample")
#' }
#'
#' @seealso \code{\link{borg_diagnose}}, \code{\link{BorgDiagnosis}}
#'
#' @export
borg_cv <- function(data,
                    diagnosis = NULL,
                    v = 5,
                    coords = NULL,
                    time = NULL,
                    groups = NULL,
                    target = NULL,
                    block_size = NULL,
                    embargo = NULL,
                    strategy = NULL,
                    output = c("list", "rsample", "caret", "mlr3"),
                    allow_random = FALSE,
                    verbose = FALSE) {

  output <- match.arg(output)

  # Input validation
  if (!is.data.frame(data)) {
    stop("'data' must be a data frame")
  }
  n <- nrow(data)
  if (n < v * 2) {
    stop(sprintf("Insufficient data: need at least %d rows for %d folds", v * 2, v))
  }

  # Run diagnosis if not provided
  if (is.null(diagnosis)) {
    if (verbose) message("Running borg_diagnose()...")
    diagnosis <- borg_diagnose(data,
                               coords = coords,
                               time = time,
                               groups = groups,
                               target = target,
                               verbose = verbose)
  }

  # Check if random CV is appropriate
  if (diagnosis@severity != "none" && !allow_random) {
    if (verbose) {
      message(sprintf("Dependency detected (%s). Using %s instead of random CV.",
                      diagnosis@dependency_type, diagnosis@recommended_cv))
    }
  } else if (diagnosis@severity != "none" && allow_random) {
    warning(sprintf(
      paste0("BORG WARNING: Random CV requested despite %s dependency.\n",
             "  Estimated metric inflation: ~%.0f%%\n",
             "  This may produce invalid performance estimates."),
      diagnosis@dependency_type,
      diagnosis@inflation_estimate$auc_inflation * 100
    ))
  }

  # Select CV strategy
  if (!is.null(strategy)) {
    valid_strategies <- c("random", "spatial_block", "temporal_block",
                          "temporal_expanding", "temporal_sliding",
                          "group_fold", "spatial_temporal", "spatial_group",
                          "temporal_group")
    if (!strategy %in% valid_strategies) {
      stop(sprintf("Unknown strategy '%s'. Must be one of: %s",
                   strategy, paste(valid_strategies, collapse = ", ")))
    }
  } else {
    strategy <- if (allow_random && diagnosis@severity != "none") {
      "random_override"
    } else {
      diagnosis@recommended_cv
    }
  }

  # Generate folds based on strategy
  folds <- switch(strategy,
    "random" = generate_random_folds(data, v),
    "random_override" = generate_random_folds(data, v),
    "spatial_block" = generate_spatial_block_folds(data, v, diagnosis, coords, block_size, verbose),
    "temporal_block" = generate_temporal_block_folds(data, v, diagnosis, time, embargo, verbose),
    "temporal_expanding" = generate_temporal_expanding_folds(data, v, diagnosis, time, embargo, verbose),
    "temporal_sliding" = generate_temporal_sliding_folds(data, v, diagnosis, time, embargo, verbose),
    "group_fold" = generate_group_folds(data, v, diagnosis, groups, verbose),
    "spatial_temporal" = generate_spatial_temporal_folds(data, v, diagnosis, coords, time, verbose),
    "spatial_group" = generate_spatial_group_folds(data, v, diagnosis, coords, groups, verbose),
    "temporal_group" = generate_temporal_group_folds(data, v, diagnosis, time, groups, verbose),
    stop(sprintf("Unknown CV strategy: %s", strategy))
  )

  # Build result
  params <- list(
    v = v,
    n = n,
    strategy = strategy,
    allow_random = allow_random
  )

  if (!is.null(block_size)) params$block_size <- block_size
  if (!is.null(embargo)) params$embargo <- embargo

  result <- list(
    folds = folds,
    diagnosis = diagnosis,
    strategy = strategy,
    params = params
  )
  class(result) <- c("borg_cv", "list")

  # Convert output format if requested
  if (output == "list") {
    return(result)
  } else if (output == "rsample") {
    return(convert_to_rsample(result, data))
  } else if (output == "caret") {
    return(convert_to_caret(result))
  } else if (output == "mlr3") {
    return(convert_to_mlr3(result, data))
  }

  result
}


#' @export
print.borg_cv <- function(x, ...) {
  cat("BORG Cross-Validation\n")
  cat("=====================\n\n")
  cat(sprintf("Strategy:    %s\n", x$strategy))
  cat(sprintf("Folds:       %d\n", length(x$folds)))
  cat(sprintf("Total obs:   %d\n", x$params$n))

  # Show fold sizes
  train_sizes <- vapply(x$folds, function(f) length(f$train), integer(1))
  test_sizes <- vapply(x$folds, function(f) length(f$test), integer(1))
  cat(sprintf("Train sizes: %d - %d (mean: %.0f)\n",
              min(train_sizes), max(train_sizes), mean(train_sizes)))
  cat(sprintf("Test sizes:  %d - %d (mean: %.0f)\n",
              min(test_sizes), max(test_sizes), mean(test_sizes)))

  # Warning if random override
  if (x$strategy == "random_override") {
    cat("\nWARNING: Random CV used despite detected dependencies.\n")
    cat(sprintf("         Estimated inflation: ~%.0f%%\n",
                x$diagnosis@inflation_estimate$auc_inflation * 100))
  }

  invisible(x)
}


# ============================================================================
# Fold generation functions
# ============================================================================

#' Generate Random Folds (Internal)
#' @noRd
generate_random_folds <- function(data, v) {
  n <- nrow(data)
  idx <- sample(n)
  fold_ids <- rep(1:v, length.out = n)

  lapply(1:v, function(i) {
    test_idx <- idx[fold_ids == i]
    train_idx <- idx[fold_ids != i]
    list(train = train_idx, test = test_idx)
  })
}


#' Generate Spatial Block Folds (Internal)
#' @noRd
generate_spatial_block_folds <- function(data, v, diagnosis, coords, block_size, verbose) {
  # Get coordinates
  if (is.null(coords)) {
    coords <- diagnosis@spatial$coords_used
  }
  if (is.null(coords)) {
    stop("Spatial blocking requires coordinates. Provide 'coords' argument.")
  }

  x_coord <- data[[coords[1]]]
  y_coord <- data[[coords[2]]]
  n <- nrow(data)

  # Determine block size
  if (is.null(block_size)) {
    if (!is.na(diagnosis@spatial$range_estimate)) {
      block_size <- diagnosis@spatial$range_estimate * 1.5  # 1.5x range for safety
    } else {
      # Fallback: divide extent by sqrt(v)
      x_range <- diff(range(x_coord, na.rm = TRUE))
      y_range <- diff(range(y_coord, na.rm = TRUE))
      block_size <- max(x_range, y_range) / sqrt(v)
    }
  }

  if (verbose) message(sprintf("Using spatial block size: %.2f", block_size))

  # Use k-means to create spatial clusters
  # Number of clusters should be >= v
  n_clusters <- max(v, ceiling(n / 20))  # At least 20 points per cluster
  n_clusters <- min(n_clusters, n %/% 3)  # But not too many

  coord_mat <- cbind(x_coord, y_coord)
  complete <- complete.cases(coord_mat)
  coord_mat_complete <- coord_mat[complete, , drop = FALSE]

  set.seed(42)  # Reproducibility
  km <- stats::kmeans(coord_mat_complete, centers = n_clusters, nstart = 10)

  # Assign clusters back (NA for incomplete)
  clusters <- rep(NA_integer_, n)
  clusters[complete] <- km$cluster

  # Assign clusters to folds (balanced by size)
  cluster_table <- table(clusters)
  cluster_ids <- as.integer(names(cluster_table))
  cluster_order <- order(cluster_table, decreasing = TRUE)

  fold_assignment <- rep(NA_integer_, length(cluster_ids))
  fold_sizes <- rep(0L, v)

  for (cid in cluster_ids[cluster_order]) {
    # Assign to fold with smallest current size
    target_fold <- which.min(fold_sizes)
    fold_assignment[cid] <- target_fold
    fold_sizes[target_fold] <- fold_sizes[target_fold] + cluster_table[as.character(cid)]
  }

  # Create folds
  obs_folds <- fold_assignment[clusters]
  obs_folds[is.na(obs_folds)] <- sample(1:v, sum(is.na(obs_folds)), replace = TRUE)

  lapply(1:v, function(i) {
    test_idx <- which(obs_folds == i)
    train_idx <- which(obs_folds != i)
    list(train = train_idx, test = test_idx)
  })
}


#' Generate Temporal Block Folds (Internal)
#' @noRd
generate_temporal_block_folds <- function(data, v, diagnosis, time_col, embargo, verbose) {
  # Get time column
  if (is.null(time_col)) {
    time_col <- diagnosis@temporal$time_col
  }
  if (is.null(time_col)) {
    stop("Temporal blocking requires time column. Provide 'time' argument.")
  }

  time_vals <- data[[time_col]]
  n <- nrow(data)

  # Convert to numeric
  if (inherits(time_vals, "Date") || inherits(time_vals, "POSIXt")) {
    time_numeric <- as.numeric(time_vals)
  } else {
    time_numeric <- as.numeric(time_vals)
  }

  # Determine embargo
  if (is.null(embargo)) {
    if (!is.na(diagnosis@temporal$embargo_minimum)) {
      embargo <- diagnosis@temporal$embargo_minimum
    } else {
      # Fallback: 5% of time range
      embargo <- diff(range(time_numeric, na.rm = TRUE)) * 0.05
    }
  }

  if (verbose) message(sprintf("Using temporal embargo: %.2f", embargo))

  # Sort by time
  ord <- order(time_numeric)

  # Create temporal folds with embargo
  # Use forward chaining approach
  fold_boundaries <- round(seq(0, n, length.out = v + 1))

  folds <- lapply(1:v, function(i) {
    # For fold i, test on block i, train on blocks before
    test_start <- fold_boundaries[i] + 1
    test_end <- fold_boundaries[i + 1]
    test_idx <- ord[test_start:test_end]

    # Training: all data before test block (minus embargo)
    if (test_start > 1) {
      test_min_time <- min(time_numeric[test_idx], na.rm = TRUE)
      train_mask <- time_numeric[ord[1:(test_start - 1)]] < (test_min_time - embargo)
      train_idx <- ord[1:(test_start - 1)][train_mask]
    } else {
      train_idx <- integer(0)
    }

    # If no training data, use later data (expanding window)
    if (length(train_idx) < 10) {
      # Use later blocks as training instead
      if (test_end < n) {
        test_max_time <- max(time_numeric[test_idx], na.rm = TRUE)
        after_mask <- time_numeric[ord[(test_end + 1):n]] > (test_max_time + embargo)
        train_idx <- c(train_idx, ord[(test_end + 1):n][after_mask])
      }
    }

    list(train = train_idx, test = test_idx)
  })

  # Filter out folds with insufficient training data
  valid_folds <- Filter(function(f) length(f$train) >= 10, folds)

  if (length(valid_folds) < 2) {
    warning("Temporal blocking with embargo produced < 2 valid folds. ",
            "Consider reducing embargo or using a different strategy.")
    # Fallback to simple temporal split without embargo
    folds <- lapply(1:v, function(i) {
      test_start <- fold_boundaries[i] + 1
      test_end <- fold_boundaries[i + 1]
      test_idx <- ord[test_start:test_end]
      train_idx <- setdiff(ord, test_idx)
      list(train = train_idx, test = test_idx)
    })
  } else {
    folds <- valid_folds
  }

  folds
}


#' Generate Group Folds (Internal)
#' @noRd
generate_group_folds <- function(data, v, diagnosis, group_col, verbose) {
  # Get group column
  if (is.null(group_col)) {
    group_col <- diagnosis@clustered$group_col
  }
  if (is.null(group_col)) {
    stop("Group CV requires group column. Provide 'groups' argument.")
  }

  groups <- data[[group_col]]
  unique_groups <- unique(groups)
  n_groups <- length(unique_groups)

  if (n_groups < v) {
    warning(sprintf("Only %d groups available, reducing to %d folds", n_groups, n_groups))
    v <- n_groups
  }

  if (verbose) message(sprintf("Creating %d-fold group CV with %d groups", v, n_groups))

  # Assign groups to folds (balanced by total observations)
  group_sizes <- table(groups)
  group_order <- order(group_sizes, decreasing = TRUE)

  fold_assignment <- rep(NA_integer_, n_groups)
  names(fold_assignment) <- unique_groups
  fold_sizes <- rep(0L, v)

  for (g in unique_groups[group_order]) {
    target_fold <- which.min(fold_sizes)
    fold_assignment[g] <- target_fold
    fold_sizes[target_fold] <- fold_sizes[target_fold] + group_sizes[g]
  }

  # Create folds
  lapply(1:v, function(i) {
    test_groups <- names(fold_assignment)[fold_assignment == i]
    test_idx <- which(groups %in% test_groups)
    train_idx <- which(!groups %in% test_groups)
    list(train = train_idx, test = test_idx)
  })
}


#' Generate Spatial-Temporal Folds (Internal)
#' @noRd
generate_spatial_temporal_folds <- function(data, v, diagnosis, coords, time_col, verbose) {
  # Hybrid approach: temporal blocks with spatial buffer
  if (is.null(time_col)) time_col <- diagnosis@temporal$time_col
  if (is.null(coords)) coords <- diagnosis@spatial$coords_used

  # Primary split by time
  temporal_folds <- generate_temporal_block_folds(data, v, diagnosis, time_col, NULL, verbose)

  # For each fold, apply additional spatial buffer
  if (!is.null(coords)) {
    x_coord <- data[[coords[1]]]
    y_coord <- data[[coords[2]]]

    buffer_dist <- if (!is.na(diagnosis@spatial$range_estimate)) {
      diagnosis@spatial$range_estimate
    } else {
      # 10% of spatial extent
      max(diff(range(x_coord, na.rm = TRUE)), diff(range(y_coord, na.rm = TRUE))) * 0.1
    }

    temporal_folds <- lapply(temporal_folds, function(fold) {
      test_x <- x_coord[fold$test]
      test_y <- y_coord[fold$test]

      # Remove training points too close to test points
      train_keep <- vapply(fold$train, function(i) {
        min_dist <- min(sqrt((x_coord[i] - test_x)^2 + (y_coord[i] - test_y)^2))
        min_dist > buffer_dist
      }, logical(1))

      list(train = fold$train[train_keep], test = fold$test)
    })
  }

  temporal_folds
}


#' Generate Spatial-Group Folds (Internal)
#' @noRd
generate_spatial_group_folds <- function(data, v, diagnosis, coords, group_col, verbose) {
  # Use groups as primary structure, ensure spatial separation
  group_folds <- generate_group_folds(data, v, diagnosis, group_col, verbose)

  # Apply spatial buffer
  if (!is.null(coords) || !is.null(diagnosis@spatial$coords_used)) {
    if (is.null(coords)) coords <- diagnosis@spatial$coords_used

    x_coord <- data[[coords[1]]]
    y_coord <- data[[coords[2]]]

    buffer_dist <- if (!is.na(diagnosis@spatial$range_estimate)) {
      diagnosis@spatial$range_estimate
    } else {
      max(diff(range(x_coord, na.rm = TRUE)), diff(range(y_coord, na.rm = TRUE))) * 0.1
    }

    group_folds <- lapply(group_folds, function(fold) {
      test_x <- x_coord[fold$test]
      test_y <- y_coord[fold$test]

      train_keep <- vapply(fold$train, function(i) {
        min_dist <- min(sqrt((x_coord[i] - test_x)^2 + (y_coord[i] - test_y)^2))
        min_dist > buffer_dist
      }, logical(1))

      list(train = fold$train[train_keep], test = fold$test)
    })
  }

  group_folds
}


#' Generate Temporal-Group Folds (Internal)
#' @noRd
generate_temporal_group_folds <- function(data, v, diagnosis, time_col, group_col, verbose) {
  # Group-out with temporal ordering
  if (is.null(time_col)) time_col <- diagnosis@temporal$time_col
  if (is.null(group_col)) group_col <- diagnosis@clustered$group_col

  groups <- data[[group_col]]
  time_vals <- data[[time_col]]

  # Order groups by their mean time
  group_times <- tapply(as.numeric(time_vals), groups, mean, na.rm = TRUE)
  group_order <- names(sort(group_times))

  unique_groups <- group_order
  n_groups <- length(unique_groups)

  if (n_groups < v) {
    v <- n_groups
  }

  # Assign groups to folds in temporal order
  fold_assignment <- rep(1:v, length.out = n_groups)
  names(fold_assignment) <- unique_groups

  lapply(1:v, function(i) {
    test_groups <- names(fold_assignment)[fold_assignment == i]
    test_idx <- which(groups %in% test_groups)
    train_idx <- which(!groups %in% test_groups)
    list(train = train_idx, test = test_idx)
  })
}


#' Generate Temporal Expanding Window Folds (Internal)
#'
#' Forward-chaining: training window grows, test window slides forward.
#' @noRd
generate_temporal_expanding_folds <- function(data, v, diagnosis, time_col, embargo, verbose) {
  if (is.null(time_col)) time_col <- diagnosis@temporal$time_col
  if (is.null(time_col)) {
    stop("Expanding window CV requires time column. Provide 'time' argument.")
  }

  time_vals <- data[[time_col]]
  n <- nrow(data)
  time_numeric <- as.numeric(time_vals)
  ord <- order(time_numeric)

  if (is.null(embargo)) {
    if (!is.na(diagnosis@temporal$embargo_minimum)) {
      embargo <- diagnosis@temporal$embargo_minimum
    } else {
      embargo <- 0
    }
  }

  # Minimum initial training size: 2x test size
  test_size <- floor(n / (v + 1))
  initial_train_size <- max(test_size * 2, floor(n / (v + 1)))

  if (verbose) {
    message(sprintf("Expanding window: initial train = %d, test size = %d, embargo = %s",
                    initial_train_size, test_size, embargo))
  }

  folds <- list()
  for (i in seq_len(v)) {
    train_end <- initial_train_size + (i - 1) * test_size
    test_start <- train_end + 1
    test_end <- min(test_start + test_size - 1, n)

    if (test_start > n || train_end > n) break

    train_idx <- ord[1:train_end]
    test_idx <- ord[test_start:test_end]

    # Apply embargo: remove training points too close in time to test
    if (embargo > 0 && length(test_idx) > 0 && length(train_idx) > 0) {
      test_min_time <- min(time_numeric[test_idx], na.rm = TRUE)
      keep <- time_numeric[train_idx] < (test_min_time - embargo)
      train_idx <- train_idx[keep]
    }

    if (length(train_idx) >= 10 && length(test_idx) >= 1) {
      folds <- c(folds, list(list(train = train_idx, test = test_idx)))
    }
  }

  if (length(folds) < 2) {
    warning("Expanding window produced < 2 valid folds. Consider fewer folds or smaller embargo.")
  }

  folds
}


#' Generate Temporal Sliding Window Folds (Internal)
#'
#' Fixed-size training window slides forward with the test window.
#' @noRd
generate_temporal_sliding_folds <- function(data, v, diagnosis, time_col, embargo, verbose) {
  if (is.null(time_col)) time_col <- diagnosis@temporal$time_col
  if (is.null(time_col)) {
    stop("Sliding window CV requires time column. Provide 'time' argument.")
  }

  time_vals <- data[[time_col]]
  n <- nrow(data)
  time_numeric <- as.numeric(time_vals)
  ord <- order(time_numeric)

  if (is.null(embargo)) {
    if (!is.na(diagnosis@temporal$embargo_minimum)) {
      embargo <- diagnosis@temporal$embargo_minimum
    } else {
      embargo <- 0
    }
  }

  # Fixed window size: enough for each fold to have train + test
  test_size <- floor(n / (v + 1))
  window_size <- test_size * 3  # Training window = 3x test size
  step_size <- floor((n - window_size - test_size) / max(v - 1, 1))
  step_size <- max(step_size, 1)

  if (verbose) {
    message(sprintf("Sliding window: window = %d, test = %d, step = %d, embargo = %s",
                    window_size, test_size, step_size, embargo))
  }

  folds <- list()
  for (i in seq_len(v)) {
    train_start <- 1 + (i - 1) * step_size
    train_end <- min(train_start + window_size - 1, n)
    test_start <- train_end + 1
    test_end <- min(test_start + test_size - 1, n)

    if (test_start > n || train_end > n) break

    train_idx <- ord[train_start:train_end]
    test_idx <- ord[test_start:test_end]

    # Apply embargo
    if (embargo > 0 && length(test_idx) > 0 && length(train_idx) > 0) {
      test_min_time <- min(time_numeric[test_idx], na.rm = TRUE)
      keep <- time_numeric[train_idx] < (test_min_time - embargo)
      train_idx <- train_idx[keep]
    }

    if (length(train_idx) >= 10 && length(test_idx) >= 1) {
      folds <- c(folds, list(list(train = train_idx, test = test_idx)))
    }
  }

  if (length(folds) < 2) {
    warning("Sliding window produced < 2 valid folds. Consider fewer folds or smaller window.")
  }

  folds
}


# ============================================================================
# Output format converters
# ============================================================================

#' Convert to rsample Format (Internal)
#' @noRd
convert_to_rsample <- function(borg_cv_obj, data) {
  if (!requireNamespace("rsample", quietly = TRUE)) {
    stop("Package 'rsample' required for output='rsample'. Install with: install.packages('rsample')")
  }

  # Create manual_rset
  splits <- lapply(borg_cv_obj$folds, function(fold) {
    rsample::make_splits(
      x = list(analysis = fold$train, assessment = fold$test),
      data = data
    )
  })

  ids <- paste0("Fold", seq_along(splits))

  rsample::manual_rset(splits, ids)
}


#' Convert to caret Format (Internal)
#' @noRd
convert_to_caret <- function(borg_cv_obj) {
  if (!requireNamespace("caret", quietly = TRUE)) {
    stop("Package 'caret' required for output='caret'. Install with: install.packages('caret')")
  }

  # Create index and indexOut lists
  index <- lapply(borg_cv_obj$folds, function(f) f$train)
  indexOut <- lapply(borg_cv_obj$folds, function(f) f$test)

  names(index) <- paste0("Fold", seq_along(index))
  names(indexOut) <- paste0("Fold", seq_along(indexOut))

  caret::trainControl(
    method = "cv",
    index = index,
    indexOut = indexOut,
    savePredictions = "final"
  )
}


#' Convert to mlr3 Format (Internal)
#' @noRd
convert_to_mlr3 <- function(borg_cv_obj, data) {
  if (!requireNamespace("mlr3", quietly = TRUE)) {
    stop("Package 'mlr3' required for output='mlr3'. Install with: install.packages('mlr3')")
  }

  # Create custom resampling
  train_sets <- lapply(borg_cv_obj$folds, function(f) f$train)
  test_sets <- lapply(borg_cv_obj$folds, function(f) f$test)

  mlr3::rsmp("custom")$instantiate(
    mlr3::TaskClassif$new(id = "temp", backend = data, target = names(data)[1]),
    train_sets = train_sets,
    test_sets = test_sets
  )
}

Try the BORG package in your browser

Any scripts or data that you put into this service are public.

BORG documentation built on March 20, 2026, 5:09 p.m.