R/trainValTest.R

Defines functions trainValTest

Documented in trainValTest

#' Train, Validation and Test datasets
#'
#' Split a dataset randomly in training and testing datasets or training,
#' validation and testing datasets.
#'
#' @param x \linkS4class{SWD} object containing the data that have to be split
#' in training, validation and testing datasets.
#' @param test numeric. The percentage of data withhold for testing.
#' @param val numeric. The percentage of data withhold for validation, default
#' is `0`.
#' @param only_presence logical. If `TRUE` the split is done only for the
#' presence locations and all the background locations are included in each
#' partition, used manly for presence-only methods, default is `FALSE`.
#' @param seed numeric. The value used to set the seed in order to have
#' consistent results, default is `NULL`.
#'
#' @details When `only_presence = FALSE`, the proportion of presence and
#' absence is preserved.
#'
#' @return A list with the training, validation and testing or training and
#' testing \linkS4class{SWD} objects accordingly.
#' @export
#'
#' @author Sergio Vignali
#'
#' @examples
#' # Acquire environmental variables
#' files <- list.files(path = file.path(system.file(package = "dismo"), "ex"),
#'                     pattern = "grd",
#'                     full.names = TRUE)
#'
#' predictors <- terra::rast(files)
#'
#' # Prepare presence and background locations
#' p_coords <- virtualSp$presence
#' bg_coords <- virtualSp$background
#'
#' # Create SWD object
#' data <- prepareSWD(species = "Virtual species",
#'                    p = p_coords,
#'                    a = bg_coords,
#'                    env = predictors,
#'                    categorical = "biome")
#'
#' # Split presence locations in training (80%) and testing (20%) datasets
#' # and splitting only the presence locations
#' datasets <- trainValTest(data,
#'                          test = 0.2,
#'                          only_presence = TRUE)
#' train <- datasets[[1]]
#' test <- datasets[[2]]
#'
#' # Split presence locations in training (60%), validation (20%) and testing
#' # (20%) datasets and splitting the presence and the absence locations
#' datasets <- trainValTest(data,
#'                          val = 0.2,
#'                          test = 0.2)
#' train <- datasets[[1]]
#' val <- datasets[[2]]
#' test <- datasets[[3]]
trainValTest <- function(x,
                         test,
                         val = 0,
                         only_presence = FALSE,
                         seed = NULL) {

  if (!inherits(x, "SWD"))
    cli::cli_abort(c(
      "!" = "{.var x} must be an {.cls SWD} object",
      "x" = "You have supplied a {.cls {class(x)}} instead."
    ))

  if (!is.null(seed))
    set.seed(seed)

  np <- sum(x@pa == 1)
  na <- sum(x@pa == 0)
  p_perm <- sample(which(x@pa == 1))
  a_perm <- sample(which(x@pa == 0))

  if (val > 0) {
    start_p_val <- round(np * (1 - val - test), 0) + 1
    start_a_val <- round(na * (1 - val - test), 0) + 1
    start_p_test <- round(np * val, 0) + start_p_val
    start_a_test <- round(na * val, 0) + start_a_val
    train <- val <- test <- SWD()

    # Training presences
    train@species <- x@species
    train@data <- x@data[p_perm[1:(start_p_val - 1)], ]
    train@coords <- x@coords[p_perm[1:(start_p_val - 1)], ]
    train@pa <- c(rep(1, nrow(train@coords)))

    # Validation presences
    val@species <- x@species
    val@data <- x@data[p_perm[start_p_val:(start_p_test - 1)], ]
    val@coords <- x@coords[p_perm[start_p_val:(start_p_test - 1)], ]
    val@pa <- c(rep(1, nrow(val@coords)))

    # Testind SWD
    test@species <- x@species
    test@data <- x@data[p_perm[start_p_test:np], ]
    test@coords <- x@coords[p_perm[start_p_test:np], ]
    test@pa <- c(rep(1, nrow(test@coords)))
    if (only_presence) {
      # Training absences
      train@data <- rbind(train@data, x@data[x@pa == 0, ])
      train@coords <- rbind(train@coords, x@coords[x@pa == 0, ])
      train@pa <- c(train@pa, rep(0, na))
      # Validation absences
      val@data <- rbind(val@data, x@data[x@pa == 0, ])
      val@coords <- rbind(val@coords, x@coords[x@pa == 0, ])
      val@pa <- c(val@pa, rep(0, na))
      # Testing absences
      test@data <- rbind(test@data, x@data[x@pa == 0, ])
      test@coords <- rbind(test@coords, x@coords[x@pa == 0, ])
      test@pa <- c(test@pa, rep(0, na))
    } else {
      # Training absences
      train@data <- rbind(train@data, x@data[a_perm[1:(start_a_val - 1)], ])
      train@coords <- rbind(train@coords,
                            x@coords[a_perm[1:(start_a_val - 1)], ])
      train@pa <- c(train@pa, rep(0, (nrow(train@coords) - length(train@pa))))
      # Validation absences
      val@data <- rbind(val@data,
                        x@data[a_perm[start_a_val:(start_a_test - 1)], ])
      val@coords <- rbind(val@coords,
                          x@coords[a_perm[start_a_val:(start_a_test - 1)], ])
      val@pa <- c(val@pa, rep(0, (nrow(val@coords) - length(val@pa))))
      # Testing absences
      test@data <- rbind(test@data, x@data[a_perm[start_a_test:na], ])
      test@coords <- rbind(test@coords, x@coords[a_perm[start_a_test:na], ])
      test@pa <- c(test@pa, rep(0, (nrow(test@coords) - length(test@pa))))
    }
    # Reset row names
    rownames(train@data) <- NULL
    rownames(train@coords) <- NULL
    rownames(val@data) <- NULL
    rownames(val@coords) <- NULL
    rownames(test@data) <- NULL
    rownames(test@coords) <- NULL

    return(list(train, val, test))

  } else {
    start_p_test <- round(np * (1 - test), 0) + 1
    start_a_test <- round(na * (1 - test), 0) + 1
    train <- test <- SWD()

    # Training presences
    train@species <- x@species
    train@data <- x@data[p_perm[1:(start_p_test - 1)], ]
    train@coords <- x@coords[p_perm[1:(start_p_test - 1)], ]
    train@pa <- c(rep(1, nrow(train@coords)))

    # Testing presences
    test@species <- x@species
    test@data <- x@data[p_perm[start_p_test:np], ]
    test@coords <- x@coords[p_perm[start_p_test:np], ]
    test@pa <- c(rep(1, nrow(test@coords)))

    if (only_presence) {
      # Training absences
      train@data <- rbind(train@data, x@data[x@pa == 0, ])
      train@coords <- rbind(train@coords, x@coords[x@pa == 0, ])
      train@pa <- c(train@pa, rep(0, na))
      # Testing absences
      test@data <- rbind(test@data, x@data[x@pa == 0, ])
      test@coords <- rbind(test@coords, x@coords[x@pa == 0, ])
      test@pa <- c(test@pa, rep(0, na))
    } else {
      # Training absences
      train@data <- rbind(train@data, x@data[a_perm[1:(start_a_test - 1)], ])
      train@coords <- rbind(train@coords,
                            x@coords[a_perm[1:(start_a_test - 1)], ])
      train@pa <- c(train@pa, rep(0, (nrow(train@coords) - length(train@pa))))
      # Testing absences
      test@data <- rbind(test@data, x@data[a_perm[start_a_test:na], ])
      test@coords <- rbind(test@coords,  x@coords[a_perm[start_a_test:na], ])
      test@pa <- c(test@pa, rep(0, (nrow(test@coords) - length(test@pa))))
    }
    # Reset row names
    rownames(train@data) <- NULL
    rownames(train@coords) <- NULL
    rownames(test@data) <- NULL
    rownames(test@coords) <- NULL

    return(list(train, test))
  }
}

Try the SDMtune package in your browser

Any scripts or data that you put into this service are public.

SDMtune documentation built on July 9, 2023, 6:03 p.m.