R/check_split_balance.R

Defines functions check_splits_balance

Documented in check_splits_balance

#' Check the balance of presences vs pseudoabsences among splits
#'
#' @param splits the data splits (an `rset` or `split` object), generated by a
#'   function such as [spatialsample::spatial_block_cv()]
#' @param .col the column containing the presences
#' @returns a tibble of the number of presences and pseudoabsences in the
#'   assessment and analysis set of each split (or training and testing in an
#'   initial split)
#' @export
#' @examples
#' lacerta_thin <- readRDS(system.file("extdata/lacerta_thin_all_vars.rds",
#'   package = "tidysdm"
#' ))
#' lacerta_cv <- spatial_block_cv(lacerta_thin, v = 5)
#' check_splits_balance(lacerta_cv, class)
#'
check_splits_balance <- function(splits, .col) {
  .col <- rlang::enquo(.col) %>%
    rlang::quo_get_expr() %>%
    rlang::as_string()

  if (inherits(splits, "rset")) {
    if (!(.col %in% names(splits$splits[[1]]$data))) {
      stop(".col should be a column in the data used to generate the splits")
    }
    training_list <- lapply(splits$splits, function(x) {
      table(
        rsample::training(x) %>%
          sf::st_drop_geometry() %>%
          dplyr::pull(.col)
      )
    })
    training_df <- do.call("rbind", training_list)
    testing_list <- lapply(splits$splits, function(x) {
      table(
        rsample::testing(x) %>%
          sf::st_drop_geometry() %>%
          dplyr::pull(.col)
      )
    })
    testing_df <- do.call("rbind", testing_list)
    dimnames(testing_df)[[2]] <- paste0(dimnames(testing_df)[[2]], "_analysis")
    dimnames(training_df)[[2]] <- paste0(
      dimnames(training_df)[[2]],
      "_assessment"
    )

    balance_df <- dplyr::bind_cols(training_df, testing_df)
  } else if (inherits(splits, "rsplit")) {
    if (!(.col %in% names(splits$data))) {
      stop(".col should be a column in the data used to generate the splits")
    }
    training_df <- table(
      rsample::training(splits) %>%
        sf::st_drop_geometry() %>%
        dplyr::pull(.col)
    )
    testing_df <- table(
      rsample::testing(splits) %>%
        sf::st_drop_geometry() %>%
        dplyr::pull(.col)
    )
    # coerce to a df with only one row
    balance_df <- as.data.frame(matrix(c(training_df, testing_df), nrow = 1))
    names(balance_df) <- c(
      paste0(names(testing_df), "_test"),
      paste0(names(training_df), "_train")
    )
    balance_df <- tibble::as_tibble(balance_df)
  } else {
    stop("splits should be either a spatial_rset or a spatial_rsplit")
  }

  return(balance_df)
}

Try the tidysdm package in your browser

Any scripts or data that you put into this service are public.

tidysdm documentation built on April 3, 2025, 9:56 p.m.