R/cv_buffer.R

Defines functions .ttt summary.cv_buffer plot.cv_buffer print.cv_buffer cv_buffer

Documented in cv_buffer

#' Use buffer around records to separate train and test folds (a.k.a. buffered/spatial leave-one-out)
#'
#' This function generates spatially separated train and test folds by considering buffers of
#' the specified distance (\code{size} parameter) around each observation point.
#' This approach is a form of \emph{leave-one-out} cross-validation. Each fold is generated by excluding
#' nearby observations around each testing point within the specified distance (ideally the range of
#' spatial autocorrelation, see \code{\link{cv_spatial_autocor}}). In this method, the testing set never
#' directly abuts a training sample (e.g. presence or absence; 0s and 1s). For more information see the details section.
#'
#' When working with presence-background (presence and pseudo-absence) species distribution
#' data (should be specified by \code{presence_bg = TRUE} argument), only presence records are used
#' for specifying the folds (recommended). Consider a target presence point. The buffer is defined around this target point,
#' using the specified range (\code{size}). By default, the testing fold comprises only the target presence point (all background
#' points within the buffer are also added when \code{add_bg = TRUE}).
#' Any non-target presence points inside the buffer are excluded.
#' All points (presence and background) outside of buffer are used for the training set.
#' The methods cycles through all the \emph{presence} data, so the number of folds is equal to
#' the number of presence points in the dataset.
#'
#' For presence-absence data (and all other types of data), folds are created based on all records, both
#' presences and absences. As above, a target observation (presence or absence) forms a test point, all
#' presence and absence points other than the target point within the buffer are ignored, and the training
#' set comprises all presences and absences outside the buffer. Apart from the folds, the number
#' of \emph{training-presence}, \emph{training-absence}, \emph{testing-presence} and \emph{testing-absence}
#' records is stored and returned in the \code{records} table. If \code{column = NULL} and \code{presence_bg = FALSE},
#' the procedure is like presence-absence data. All other data types (continuous, count or multi-class responses) should be
#' done by \code{presence_bg = FALSE}.
#'
#'
#' @inheritParams cv_spatial
#' @param column character; indicating the name of the column in which response variable (e.g. species data as a binary
#'  response i.e. 0s and 1s) is stored. This is required when \code{presence_bg = TRUE}, otherwise optional.
#' @param size numeric value of the specified range by which training/testing data are separated.
#' This distance should be in \strong{metres}. The range could be explored by \code{\link{cv_spatial_autocor}}.
#' @param presence_bg logical; whether to treat data as species presence-background data. For all other data
#' types (presence-absence, continuous, count or multi-class responses), this option should be \code{FALSE}.
#' @param add_bg logical; add background points to the test set when \code{presence_bg = TRUE}. We do not
#' recommend this according to Radosavljevic & Anderson (2014). Keep it \code{FALSE}, unless you mean to add
#' the background pints to testing points.
#' @param progress logical; whether to shows a progress bar.
#' @param report logical; whether to generate print summary of records in each fold; for very big
#' datasets, set to \code{FALSE} for faster calculation.
#'
#' @seealso \code{\link{cv_nndm}}, \code{\link{cv_spatial}}, and \code{\link{cv_spatial_autocor}}
#'
#' @references Radosavljevic, A., & Anderson, R. P. (2014). Making better Maxent models of species
#' distributions: Complexity, overfitting and evaluation. Journal of Biogeography, 41, 629–643. https://doi.org/10.1111/jbi.12227
#'
#' @return An object of class S3. A list of objects including:
#'     \itemize{
#'     \item{folds_list - a list containing the folds. Each fold has two vectors with the training (first) and testing (second) indices}
#'     \item{k - number of the folds}
#'     \item{size - the defined range of spatial autocorrelation)}
#'     \item{column - the name of the column if provided}
#'     \item{presence_bg - whether this was treated as presence-background data}
#'     \item{records - a table with the number of points in each category of training and testing}
#'     }
#' @export
#'
#' @examples
#' \donttest{
#' library(blockCV)
#'
#' # import presence-absence species data
#' points <- read.csv(system.file("extdata/", "species.csv", package = "blockCV"))
#' # make an sf object from data.frame
#' pa_data <- sf::st_as_sf(points, coords = c("x", "y"), crs = 7845)
#'
#' bloo <- cv_buffer(x = pa_data,
#'                   column = "occ",
#'                   size = 350000, # size in metres no matter the CRS
#'                   presence_bg = FALSE)
#'
#' }
cv_buffer <- function(
    x,
    column = NULL,
    size,
    presence_bg = FALSE,
    add_bg = FALSE,
    progress = TRUE,
    report = TRUE
){

  # check x is an sf object
  x <- .check_x(x)
  # is column in x?
  column <- .check_column(column, x)

  # x's CRS must be defined
  if(is.na(sf::st_crs(x))){
    stop("The coordinate reference system of 'x' must be defined.")
  }

  if(is.null(column) && presence_bg) stop("'column' must be provided for presence-background data.")


  # distance matrix by sf
  dmatrix <- sf::st_distance(x)
  units(dmatrix) <- NULL

  if(presence_bg){
    unqsp <- unique(x[, column, drop = TRUE])
    if(!is.numeric(unqsp) || any(unqsp < 0) || any(unqsp > 1)){
      stop("Presence-background option is only for species data with 0s (backgrounds/pseudo-absences) and 1s (presences).\n", "The data should be numeric.\n")
    }
    # indices of presences
    x_1s <- which(x[, column, drop = TRUE] == 1)
  } else{
    x_1s <- 1:nrow(x)
  }

  # the k
  n <- length(x_1s)

  # add background only if both true
  add_bg <- (presence_bg && add_bg)

  if(progress) pb <- utils::txtProgressBar(min = 0, max = n, style = 3)
  fold_list <- lapply(x_1s, function(i, pbag = add_bg){
    if(pbag){
      test_ids <- which(dmatrix[i, ] <= size)
      inside <- x[test_ids, column, drop = TRUE]
      test_set <- test_ids[which(inside == 0)]
      test_set <- c(test_set, i) # change this back
    } else{
      test_set <- i
    }
    if(progress) utils::setTxtProgressBar(pb, i)
    list(as.numeric(which(dmatrix[i, ] > size)),
         as.numeric(test_set))
  }
  )

  # calculate train test table summary
  if(report){
    train_test_table <- .ttt(fold_list, x, column, n)
    print(summary(train_test_table)[c(1,4,6), ])
  }

  final_objs <- list(
    folds_list = fold_list,
    k = n,
    column = column,
    size = size,
    presence_bg = presence_bg,
    records = if(report) train_test_table else NULL
  )

  class(final_objs) <- c("cv_buffer")
  return(final_objs)
}


#' @export
#' @method print cv_buffer
print.cv_buffer <- function(x, ...){
  print(class(x))
}

#' @export
#' @method plot cv_buffer
plot.cv_buffer <- function(x, ...){
  message("Please use cv_plot function to plot each fold.")
}

#' @export
#' @method summary cv_buffer
summary.cv_buffer <- function(object, ...){
  if(!is.null(object$records)){
    print(summary(object$records)[c(1,4,6),])
  }
}


# count the train and test records
.ttt <- function(fold_list, x, column, n){
  if(is.null(column)){
    tt_count <- base::data.frame(train = rep(0, n), test = 0)
    for(i in seq_len(n)){
      train_set <- fold_list[[i]][[1]]
      test_set <- fold_list[[i]][[2]]
      tt_count$train[i] <- length(train_set)
      tt_count$test[i] <- length(test_set)
    }
  } else{
    cl <- sort(unique(x[, column, drop = TRUE]))
    clen <- length(cl)
    .check_classes(clen, column) # column should be binary or categorical
    tt_count <- as.data.frame(matrix(0, nrow = n, ncol = clen * 2))
    names(tt_count) <- c(paste("train", cl, sep = "_"), paste("test", cl, sep = "_"))
    for(i in seq_len(n)){
      train_set <- fold_list[[i]][[1]]
      test_set <- fold_list[[i]][[2]]
      countrain <- table(x[train_set, column, drop = TRUE])
      countest <- table(x[test_set, column, drop = TRUE])
      tt_count[i, which(cl %in% names(countrain))] <- countrain
      tt_count[i, clen + which(cl %in% names(countest))] <- countest
    }
  }

  return(tt_count)
}
rvalavi/blockCV documentation built on May 4, 2024, 2 a.m.