#' @title Partition the data for a (non-spatial) cross-validation
#'
#' @description `partition_cv` creates a [represampling] object for
#' `length(repetition)`-repeated `nfold`-fold cross-validation.
#'
#' @param data `data.frame` containing at least the columns specified by
#' `coords`
#' @param coords (ignored by `partition_cv`)
#' @param nfold number of partitions (folds) in `nfold`-fold cross-validation
#' partitioning
#' @param repetition numeric vector: cross-validation repetitions to be
#' generated. Note that this is not the number of repetitions, but the indices
#' of these repetitions. E.g., use `repetition = c(1:100)` to obtain (the
#' 'first') 100 repetitions, and `repetition = c(101:200)` to obtain a
#' different set of 100 repetitions.
#' @param seed1 `seed1+i` is the random seed that will be used by [set.seed] in
#' repetition `i` (`i` in `repetition`) to initialize the random number
#' generator before sampling from the data set.
#' @param return_factor if `FALSE` (default), return a [represampling] object;
#' if `TRUE` (used internally by other {sperrorest} functions), return a
#' `list` containing factor vectors (see Value)
#'
#' @details This function does not actually perform a cross-validation or
#' partition the data set itself; it simply creates a data structure
#' containing the indices of training and test samples.
#'
#' @return If `return_factor = FALSE` (the default), a [represampling] object.
#' Specifically, this is a (named) list of `length(repetition)` `resampling`
#' objects. Each of these [resampling] objects is a list of length `nfold`
#' corresponding to the folds. Each fold is represented by a list of
#' containing the components `train` and `test`, specifying the indices of
#' training and test samples (row indices for `data`). If `return_factor =
#' TRUE` (mainly used internally), a (named) list of length
#' `length(repetition)`. Each component of this list is a vector of length
#' `nrow(data)` of type `factor`, specifying for each sample the fold to which
#' it belongs. The factor levels are `factor(1:nfold)`.
#'
#' @seealso [sperrorest], [represampling]
#'
#' @examples
#' data(ecuador)
#' ## non-spatial cross-validation:
#' resamp <- partition_cv(ecuador, nfold = 5, repetition = 5)
#' # plot(resamp, ecuador)
#' # first repetition, second fold, test set indices:
#' idx <- resamp[["1"]][[2]]$test
#' # test sample used in this particular repetition and fold:
#' ecuador[idx, ]
#' @name partition_cv
#' @export
partition_cv <- function(data,
coords = c("x", "y"),
nfold = 10, repetition = 1,
seed1 = NULL,
return_factor = FALSE) {
resampling <- list()
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt) # nocov
}
resampler <- sample(rep(sample(nfold), length = nrow(data)),
size = nrow(data)
)
resampler <- factor(resampler)
if (!return_factor) {
resampler <- as.resampling(resampler) # nolint
}
resampling[[as.character(cnt)]] <- resampler
}
if (!return_factor) {
resampling <- as.represampling(resampling) # nolint
}
return(resampling)
}
#' @title Partition the data for a stratified (non-spatial) cross-validation
#'
#' @description `partition_cv_strat` creates a set of sample indices
#' corresponding to cross-validation test and training sets.
#'
#' @name partition_cv_strat
#' @inheritParams partition_cv
#'
#' @param coords vector of length 2 defining the variables in `data` that
#' contain the x and y coordinates of sample locations
#' @param strat character: column in `data` containing a factor variable over
#' which the partitioning should be stratified; or factor vector of length
#' `nrow(data)`: variable over which to stratify
#'
#' @return A [represampling] object, see also [partition_cv()].
#' `partition_strat_cv`, however, stratified with respect to the variable
#' `data[,strat]`; i.e., cross-validation partitioning is done within each set
#' `data[data[,strat]==i,]` (`i` in `levels(data[, strat])`), and the `i`th
#' folds of all levels are combined into one cross-validation fold.
#'
#' @seealso [sperrorest()], [as.resampling()], [resample_strat_uniform()]
#'
#' @examples
#' data(ecuador)
#' parti <- partition_cv_strat(ecuador,
#' strat = "slides", nfold = 5,
#' repetition = 1
#' )
#' idx <- parti[["1"]][[1]]$train
#' mean(ecuador$slides[idx] == "TRUE") / mean(ecuador$slides == "TRUE")
#' # always == 1
#' # Non-stratified cross-validation:
#' parti <- partition_cv(ecuador, nfold = 5, repetition = 1)
#' idx <- parti[["1"]][[1]]$train
#' mean(ecuador$slides[idx] == "TRUE") / mean(ecuador$slides == "TRUE")
#' # close to 1 because of large sample size, but with some random variation
#' @export
partition_cv_strat <- function(data, coords = c("x", "y"), nfold = 10,
return_factor = FALSE, repetition = 1,
seed1 = NULL, strat) {
repres <- list()
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
stopifnot((length(strat) == 1) | (length(strat) == nrow(data))) # nolint
if (length(strat) == 1) {
strat <- data[, strat]
}
stopifnot(is.factor(strat))
# Can't split into nfold partitions if there are less than nfold samples
# within a stratum:
minstrat <- min(tapply(strat, strat, length))
stopifnot(minstrat >= nfold)
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt) # nocov
}
fac <- rep(NA, nrow(data))
for (lev in levels(strat)) {
nstrat <- sum(sel <- (strat == lev))
fac[sel] <- sample(rep(sample(nfold), length = nstrat), size = nstrat)
}
fac <- factor(fac)
if (!return_factor) {
fac <- as.resampling(fac) # nolint
}
repres[[as.character(cnt)]] <- fac # nolint
}
if (!return_factor) {
repres <- as.represampling(repres) # nolint
}
return(repres)
}
#' @title Partition the data for a (non-spatial) leave-one-factor-out
#' cross-validation based on a given, fixed partitioning
#'
#' @description `partition_factor` creates a [represampling] object, i.e. a set
#' of sample indices defining cross-validation test and training sets.
#'
#' @inheritParams partition_cv
#'
#' @param coords vector of length 2 defining the variables in `data` that
#' contain the x and y coordinates of sample locations.
#' @param fac either the name of a variable (column) in `data`, or a vector of
#' type factor and length `nrow(data)` that contains the partitions to be used
#' for defining training and test samples.
#'
#' @return A [represampling] object, see also [partition_cv] for details.
#'
#' @note In this partitioning approach, all `repetition`s are identical and
#' therefore pseudo-replications.
#'
#' @seealso [sperrorest], [partition_cv], [as.resampling.factor]
#'
#' @examples
#' data(ecuador)
#' # I don't recommend using this partitioning for cross-validation,
#' # this is only for demonstration purposes:
#' breaks <- quantile(ecuador$dem, seq(0, 1, length = 6))
#' ecuador$zclass <- cut(ecuador$dem, breaks, include.lowest = TRUE)
#' summary(ecuador$zclass)
#' parti <- partition_factor(ecuador, fac = "zclass")
#' # plot(parti,ecuador)
#' summary(parti)
#' @export
partition_factor <- function(data, coords = c("x", "y"), fac,
return_factor = FALSE,
repetition = 1) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
if (length(fac) == 1 && is.character(fac)) {
fac <- data[, fac]
}
fac <- factor(fac)
if (!return_factor) {
fac <- as.resampling(fac) # nolint
}
represmp <- list()
for (cnt in repetition) {
represmp[[as.character(cnt)]] <- fac
}
if (!return_factor) {
represmp <- as.represampling(represmp) # nolint
}
return(represmp)
}
#' @title Partition the data for a (non-spatial) k-fold cross-validation at the
#' group level
#'
#' @description `partition_factor_cv` creates a [represampling] object, i.e. a
#' set of sample indices defining cross-validation test and training sets,
#' where partitions are obtained by resampling at the level of groups of
#' observations as defined by a given factor variable. This can be used, for
#' example, to resample agricultural data that is grouped by fields, at the
#' agricultural field level in order to preserve spatial autocorrelation
#' within fields.
#'
#' @inheritParams partition_cv
#'
#' @param coords vector of length 2 defining the variables in `data` that
#' contain the x and y coordinates of sample locations.
#' @param fac either the name of a variable (column) in `data`, or a vector of
#' type factor and length `nrow(data)` that defines groups or clusters of
#' observations.
#'
#' @return A [represampling] object, see also [partition_cv] for details.
#'
#' @note In this partitioning approach, the number of factor levels in `fac`
#' must be large enough for this factor-level resampling to make sense.
#'
#' @seealso [sperrorest], [partition_cv], [partition_factor],
#' [as.resampling.factor]
#'
#' @export
partition_factor_cv <- function(data, coords = c("x", "y"), fac, nfold = 10,
repetition = 1, seed1 = NULL,
return_factor = FALSE) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
if (length(fac) == 1 && is.character(fac)) {
fac <- data[, fac]
}
fac <- factor(fac)
if (nfold > nlevels(fac)) {
# nocov start
warning("'nfold' should be <= nlevels(fac); using nfold=nlevels(fac)\n")
# nocov end
nfold <- nlevels(fac) # nocov
}
resampling <- list()
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt) # nocov
}
fac_resampler <- sample(rep(sample(nfold), length = nlevels(fac)),
size = nlevels(fac)
)
names(fac_resampler) <- levels(fac)
resampler <- factor(fac_resampler[fac])
if (!return_factor) {
resampler <- as.resampling(resampler) # nolint
}
resampling[[as.character(cnt)]] <- resampler
}
if (!return_factor) {
resampling <- as.represampling(resampling) # nolint
}
return(resampling)
}
#' @title Partition the study area into rectangular tiles
#'
#' @description `partition_tiles` divides the study area into a specified number
#' of rectangular tiles. Optionally small partitions can be merged with
#' adjacent tiles to achieve a minimum number or percentage of samples in each
#' tile.
#'
#' @inheritParams partition_cv
#'
#' @param coords vector of length 2 defining the variables in `data` that
#' contain the x and y coordinates of sample locations
#' @param dsplit optional vector of length 2: equidistance of splits in
#' (possibly rotated) x direction (`dsplit[1]`) and y direction (`dsplit[2]`)
#' used to define tiles. If `dsplit` is of length 1, its value is recycled.
#' Either `dsplit` or `nsplit` must be specified.
#' @param nsplit optional vector of length 2: number of splits in (possibly
#' rotated) x direction (`nsplit[1]`) and y direction (`nsplit[2]`) used to
#' define tiles. If `nsplit` is of length 1, its value is recycled.
#' @param rotation indicates whether and how the rectangular grid should be
#' rotated; random rotation is only between `-45` and `+45` degrees.
#' @param user_rotation if `rotation='user'`, angles (in degrees) by which the
#' rectangular grid is to be rotated in each repetition. Either a vector of
#' same length as `repetition`, or a single number that will be replicated
#' `length(repetition)` times.
#' @param offset indicates whether and how the rectangular grid should be
#' shifted by an offset.
#' @param user_offset if `offset='user'`, a list (or vector) of two components
#' specifying a shift of the rectangular grid in (possibly rotated) x and y
#' direction. The offset values are relative values, a value of `0.5`
#' resulting in a one-half tile shift towards the left, or upward. If this is
#' a list, its first (second) component refers to the rotated x (y) direction,
#' and both components must have same length as `repetition` (or length 1). If
#' a vector of length 2 (or list components have length 1), the two values
#' will be interpreted as relative shifts in (rotated) x and y direction,
#' respectively, and will therefore be recycled as needed
#' (`length(repetition)` times each).
#' @param reassign logical (default `TRUE`): if `TRUE`, 'small' tiles (as per
#' `min_frac` and `min_n` arguments and [get_small_tiles]) are merged with
#' (smallest) adjacent tiles. If `FALSE`, small tiles are 'eliminated', i.e.
#' set to `NA`.
#' @param min_frac numeric >=0, <1: minimum relative size of partition as
#' percentage of sample; argument passed to [get_small_tiles]. Will be ignored
#' if `NULL`.
#' @param min_n integer >=0: minimum number of samples per partition; argument
#' passed to [get_small_tiles]. Will be ignored if `NULL`.
#' @param iterate argument to be passed to [tile_neighbors]
#'
#' @return A [represampling] object. Contains `length(repetition)` [resampling]
#' objects as repetitions. The exact number of folds / test-set tiles within
#' each [resampling] objects depends on the spatial configuration of the data
#' set and possible cleaning steps (see `min_frac`, `min_n`).
#'
#' @note Default parameter settings may change in future releases. This
#' function, especially the rotation and shifting part of it and the algorithm
#' for cleaning up small tiles is still a bit experimental. Use with caution.
#' For non-zero offsets (`offset!='none')`), the number of tiles may actually
#' be greater than `nsplit[1]*nsplit[2]` because of fractional tiles lurking
#' into the study region. `reassign=TRUE` with suitable thresholds is
#' therefore recommended for non-zero (including random) offsets.
#'
#' @seealso [sperrorest], [as.resampling.factor], [get_small_tiles],
#' [tile_neighbors]
#'
#' @examples
#' data(ecuador)
#' set.seed(42)
#' parti <- partition_tiles(ecuador, nsplit = c(4, 3), reassign = FALSE)
#' # plot(parti,ecuador)
#' # tile A4 has only 55 samples
#' # same partitioning, but now merge tiles with less than 100 samples to
#' # adjacent tiles:
#' parti2 <- partition_tiles(ecuador,
#' nsplit = c(4, 3), reassign = TRUE,
#' min_n = 100
#' )
#' # plot(parti2,ecuador)
#' summary(parti2)
#' # tile B4 (in 'parti') was smaller than A3, therefore A4 was merged with B4,
#' # not with A3
#' # now with random rotation and offset, and tiles of 2000 m length:
#' parti3 <- partition_tiles(ecuador,
#' dsplit = 2000, offset = "random",
#' rotation = "random", reassign = TRUE, min_n = 100
#' )
#' # plot(parti3, ecuador)
#' summary(parti3)
#' @export
partition_tiles <- function(data, coords = c("x", "y"), dsplit = NULL,
nsplit = NULL,
rotation = c("none", "random", "user"),
user_rotation, offset = c("none", "random", "user"),
user_offset, reassign = TRUE, min_frac = 0.025,
min_n = 5, iterate = 1,
return_factor = FALSE, repetition = 1,
seed1 = NULL) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
# Some basic argument checks:
stopifnot(is.numeric(min_frac) && length(min_frac) == 1)
stopifnot(is.numeric(min_n) && length(min_n) == 1)
stopifnot(is.numeric(iterate) && length(iterate) == 1)
stopifnot(!is.null(nsplit) | !is.null(dsplit))
if (!is.null(nsplit)) {
stopifnot(is.numeric(nsplit) && length(nsplit) <= 2)
} else {
stopifnot(is.numeric(dsplit) && length(dsplit) <= 2)
}
# Prepare rotation angles, if applicable:
rotation <- match.arg(rotation)
stopifnot(xor(rotation == "user", missing(user_rotation)))
if (rotation == "none") {
phi <- rep(0, length(repetition))
# nocov start
} else if (rotation == "random") {
phi <- runif(-45, 45, n = length(repetition))
} else if (rotation == "user") {
if (length(user_rotation) == 1) {
user_rotation <- rep(user_rotation, length(repetition))
}
stopifnot(length(user_rotation) == length(repetition))
phi <- user_rotation # nocov end
}
names(phi) <- as.character(repetition)
# This will make matrix multiplication (rotation) numerically better
# conditioned:
data[, coords[1]] <- data[, coords[1]] - mean(data[, coords[1]])
data[, coords[2]] <- data[, coords[2]] - mean(data[, coords[2]])
offset <- match.arg(offset)
stopifnot(xor(offset == "user", missing(user_offset)))
if (offset == "none") {
x_shift <- y_shift <- rep(0, length(repetition))
} else if (offset == "random") {
x_shift <- runif(0, 1, n = length(repetition))
y_shift <- runif(0, 1, n = length(repetition))
} else if (offset == "user") {
if (is.vector(user_offset) && length(user_offset) == 2) {
user_offset <- list(user_offset[1], user_offset[2])
}
stopifnot(is.list(user_offset) && length(user_offset) == 2)
# Recycle offsets as needed:
if (length(user_offset[[1]]) == 1) {
user_offset[[1]] <- rep(user_offset[[1]], length(repetition))
}
if (length(user_offset[[2]]) == 1) {
user_offset[[2]] <- rep(user_offset[[2]], length(repetition))
}
# Got enough user_offsets?
stopifnot(length(user_offset[[1]]) == length(repetition))
stopifnot(length(user_offset[[2]]) == length(repetition))
# Valid range, [0,1]?
stopifnot(min(user_offset[[1]]) >= 0 & max(user_offset[[1]]) <= 1)
stopifnot(min(user_offset[[2]]) >= 0 & max(user_offset[[2]]) <= 1)
x_shift <- user_offset[[1]]
y_shift <- user_offset[[2]]
}
names(x_shift) <- as.character(repetition)
names(y_shift) <- as.character(repetition)
if (!is.null(nsplit)) {
if (length(nsplit) == 1) {
nsplit <- c(nsplit, nsplit)
}
}
if (!is.null(dsplit)) {
# nocov start
if (length(dsplit) == 1) {
dsplit <- c(dsplit, dsplit)
} # nocov end
}
resampling <- list()
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt)
}
# Prepare the arguments and data:
if (rotation != "none") {
r <- phi[as.character(cnt)] * 180 / pi
r <- matrix(c(cos(r), -sin(r), sin(r), cos(r)), ncol = 2)
xy <- r %*% t(data[, coords])
x <- xy[1, ]
y <- xy[2, ]
} else {
x <- data[, coords[1]]
y <- data[, coords[2]]
}
x_range <- range(x)
y_range <- range(y)
if (!is.null(nsplit)) {
x_delta <- diff(x_range) / nsplit[1]
y_delta <- diff(y_range) / nsplit[2]
my_nsplit <- nsplit
} else {
# if !is.null(dsplit)
x_delta <- dsplit[1]
y_delta <- dsplit[2]
}
# Apply offsets:
if (offset != "none") {
# Widen the range and increase nsplit to allow for 'lurking' tiles:
x_range[2] <- x_range[2] + x_delta
y_range[2] <- y_range[2] + y_delta
x_range <- x_range - x_delta * (x_shift[as.character(cnt)])
y_range <- y_range - y_delta * (y_shift[as.character(cnt)])
if (is.null(dsplit)) {
my_nsplit <- my_nsplit + 1
}
}
# Calculate x and y splits:
if (is.null(dsplit)) {
x_split <- seq(x_range[1], x_range[2], length = my_nsplit[1] + 1)
y_split <- seq(y_range[1], y_range[2], length = my_nsplit[2] + 1)
} else {
x_split <- seq(x_range[1], x_range[2] + x_delta, by = x_delta)
y_split <- seq(y_range[1], y_range[2] + y_delta, by = y_delta)
my_nsplit <- c(length(x_split) - 1, length(y_split) - 1)
}
# Group data into tiles, i.e. assign tile labels to samples:
tile <- rep(NA, nrow(data))
for (ix in 1:my_nsplit[1]) {
# Intervals are normally open to the left, except the first one:
if (ix == 1) {
sel_x <- (x >= x_split[ix]) & (x <= x_split[ix + 1])
} else {
sel_x <- (x > x_split[ix]) & (x <= x_split[ix + 1])
}
for (iy in 1:my_nsplit[2]) {
if (iy == 1) {
sel_y <- (y >= y_split[iy]) & (y <= y_split[iy + 1])
} else {
sel_y <- (y > y_split[iy]) & (y <= y_split[iy + 1])
}
# Assign tile name to samples:
if (any(sel_x & sel_y)) {
tile[sel_x & sel_y] <- as.character(as.tilename(c(ix, iy))) # nolint
}
}
}
tile <- factor(tile)
# Identify and process small tiles:
s_tiles <- get_small_tiles(tile, min_n = min_n, min_frac = min_frac) # nolint
if (length(s_tiles) > 0) {
# any small tiles?
if (reassign) {
# Merge small tiles with neighbors:
ignore <- c()
# Repeat until no small tiles are left:
while ((length(s_tiles) > 0) & (length(levels(tile)) > 1)) { # nolint
# Start with smallest small tile:
nbrs <- tile_neighbors(s_tiles[1],
tileset = levels(tile),
iterate = iterate
)
if (length(nbrs) == 0) {
ignore <- c(ignore, as.character(s_tiles[1]))
} else {
# Merge tile with smallest neighbour to keep tile sizes balanced:
n_tile <- tapply(tile, tile, length)
s_nbr <- nbrs[which.min(n_tile[nbrs])]
tile[tile == s_tiles[1]] <- s_nbr
tile <- factor(as.character(tile))
}
# Update small tiles list:
s_tiles <- get_small_tiles(tile,
min_n = min_n, min_frac = min_frac,
ignore = ignore
)
}
} else {
# Just eliminate small tiles:
tile[tile %in% s_tiles] <- NA
tile <- factor(as.character(tile))
}
}
if (!return_factor) {
tile <- as.resampling(tile) # nolint
}
resampling[[as.character(cnt)]] <- tile
}
if (!return_factor) {
resampling <- as.represampling(resampling) # nolint
}
return(resampling)
}
#' @title Partition samples spatially using k-means clustering of the
#' coordinates
#'
#' @description `partition_kmeans` divides the study area into irregularly
#' shaped spatial partitions based on *k*-means ([kmeans]) clustering of
#' spatial coordinates.
#'
#' @inheritParams partition_cv
#'
#' @importFrom stats kmeans
#'
#' @param coords vector of length 2 defining the variables in `data` that
#' contain the x and y coordinates of sample locations.
#' @param nfold number of cross-validation folds, i.e. parameter *k* in
#' *k*-means clustering.
#' @param balancing_steps if `> 1`, perform `nfold`-means clustering
#' `balancing_steps` times, and pick the clustering that minimizes the Gini
#' index of the sample size distribution among the partitions. The idea is
#' that 'degenerate' partitions will be avoided, but this also has the side
#' effect of reducing variation among partitioning repetitions. More
#' meaningful constraints (e.g., minimum number of positive and negative
#' samples within each partition should be added in the future.
#' @param order_clusters if `TRUE`, clusters are ordered by increasing x
#' coordinate of center point.
#' @param ... additional arguments to [kmeans].
#'
#' @return A [represampling] object, see also [partition_cv] for details.
#'
#' @note Default parameter settings may change in future releases.
#'
#' @references Brenning, A., Long, S., & Fieguth, P. (2012).
#' Detecting rock glacier flow structures using Gabor filters and IKONOS
#' imagery. Remote Sensing of Environment, 125, 227-237.
#' doi:10.1016/j.rse.2012.07.005
#'
#' Russ, G. & A. Brenning. 2010a. Data mining in precision agriculture:
#' Management of spatial information. In 13th International Conference on
#' Information Processing and Management of Uncertainty,
#' IPMU 2010; Dortmund; 28 June - 2 July 2010.
#' Lecture Notes in Computer Science, 6178 LNAI: 350-359.
#'
#' @seealso [sperrorest], [partition_cv], [partition_disc], [partition_tiles],
#' [kmeans]
#'
#' @examples
#' data(ecuador)
#' resamp <- partition_kmeans(ecuador, nfold = 5, repetition = 2)
#' # plot(resamp, ecuador)
#' @export
partition_kmeans <- function(data, coords = c("x", "y"), nfold = 10,
repetition = 1, seed1 = NULL,
return_factor = FALSE, balancing_steps = 1,
order_clusters = TRUE, ...) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
balancing_steps <- max(1, balancing_steps)
resampling <- list()
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt) # nocov
}
kms <- list()
for (i in 1:balancing_steps) {
kms[[i]] <- kmeans(data[, coords], centers = nfold, ...)
}
kmgini <- function(x) {
p <- x$size / sum(x$size)
return(1 - sum(p^2))
}
km <- kms[[which.max(sapply(kms, kmgini))]]
# To do: add more meaningful selection criteria such as minimum number of
# positives and negatives in each partition ???
if (order_clusters) {
o <- rank(km$centers[, 1], ties.method = "first")
km$cluster <- o[km$cluster]
}
# The clusters are the partitions:
tile <- factor(km$cluster)
if (!return_factor) {
tile <- as.resampling(tile) # nolint
}
resampling[[as.character(cnt)]] <- tile
}
if (!return_factor) {
resampling <- as.represampling(resampling) # nolint
}
return(resampling)
}
#' @title Leave-one-disc-out cross-validation and leave-one-out cross-validation
#'
#' @description `partition_disc` partitions the sample into training and tests
#' set by selecting circular test areas (possibly surrounded by an exclusion
#' buffer) and using the remaining samples as training samples
#' (leave-one-disc-out cross-validation). `partition_loo` creates training and
#' test sets for leave-one-out cross-validation with (optional) buffer.
#'
#' @inheritParams partition_cv
#'
#' @param coords vector of length 2 defining the variables in `data` that
#' contain the x and y coordinates of sample locations.
#' @param radius radius of test area discs; performs leave-one-out resampling if
#' radius <0.
#' @param buffer radius of additional 'neutral area' around test area discs that
#' is excluded from training and test sets; defaults to 0, i.e. all samples
#' are either in the test area or in the training area.
#' @param ndisc Number of discs to be randomly selected; each disc constitutes a
#' separate test set. Defaults to `nrow(data)`, i.e. one disc around each
#' sample.
#' @param return_train If `FALSE`, returns only test sample; if `TRUE`, also the
#' training area.
#' @param prob optional argument to [sample].
#' @param replace optional argument to [sample]: sampling with or without
#' replacement?
#' @param repetition see `partition_cv`; however, see Note below: `repetition`
#' should normally be `= 1` in this function.
#' @param ... arguments to be passed to `partition_disc`
#'
#' @return A [represampling] object. Contains `length(repetition)` `resampling`
#' objects. Each of these contains `ndisc` lists with indices of test and (if
#' `return_train = TRUE`) training sets.
#'
#' @note Test area discs are centered at (random) samples, not at general
#' random locations. Test area discs may (and likely will) overlap independently
#' of the value of `replace`. `replace` only controls the replacement
#' of the center point of discs when drawing center points from the samples.
#'
#' `radius < 0` does leave-one-out resampling with an optional buffer.
#' `radius = 0` is similar except that samples with identical coordinates
#' would fall within the test area disc.
#'
#' @references Brenning, A. 2005. Spatial prediction models for landslide
#' hazards: review, comparison and evaluation. Natural Hazards and Earth System
#' Sciences, 5(6): 853-862.
#'
#' @seealso [sperrorest], [partition_cv], [partition_kmeans]
#' @name partition_disc
#'
#' @examples
#' data(ecuador)
#' parti <- partition_disc(ecuador,
#' radius = 200, buffer = 200,
#' ndisc = 5, repetition = 1:2
#' )
#' # plot(parti,ecuador)
#' summary(parti)
#'
#' # leave-one-out with buffer:
#' parti.loo <- partition_loo(ecuador, buffer = 200)
#' summary(parti)
#' @export
partition_disc <- function(data,
coords = c("x", "y"),
radius,
buffer = 0,
ndisc = nrow(data),
seed1 = NULL,
return_train = TRUE,
prob = NULL,
replace = FALSE,
repetition = 1) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
posbuf <- buffer
stopifnot(buffer >= 0)
if (replace == FALSE & ndisc > nrow(data)) {
stop("partition_disc: ndisc must be <= nrow(data) if replace=FALSE") # nocov
}
resample <- list()
# Loop for repetitions:
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt) # nocov
}
if (ndisc == nrow(data)) {
index <- seq_len(nrow(data)) # nocov
} else {
index <- sample.int(nrow(data),
size = ndisc, replace = replace,
prob = prob
)
}
res <- list()
for (i in index) {
if (!is.null(buffer) | radius >= 0) {
di <- sqrt((data[, coords[1]] - data[i, coords[1]])^2 + # nolint
(data[, coords[2]] - data[i, coords[2]])^2) # nolint
}
train_sel <- numeric()
if (radius >= 0) {
# leave-disc-out with buffer:
test_sel <- which(di <= radius)
if (return_train) {
train_sel <- which(di > (radius + posbuf))
}
} else {
# leave-one-out with buffer:
test_sel <- i
if (return_train) {
if (is.null(buffer)) {
train_sel <- seq_len(nrow(data))[-i] # nocov
} else {
train_sel <- which(di > posbuf)
}
}
}
if (return_train & (length(train_sel) == 0)) {
warning(paste0(
"empty training set in 'partition_disc': 'buffer'", # nocov #nolint
" and/or 'radius' too large?"
)) # nocov
}
res[[as.character(i)]] <- list(train = train_sel, test = test_sel)
}
resample[[as.character(cnt)]] <- res
}
repres <- as.represampling(resample) # nolint
return(repres)
}
#' @rdname partition_disc
#'
#' @name partition_loo
#'
#' @export
partition_loo <- function(data, ndisc = nrow(data), replace = FALSE, ...) {
partition_disc(
data = data, radius = -1, ndisc = ndisc,
replace = replace, ...
)
}
#' @title Non-spatial bootstrap resampling
#'
#' @description `represampling_bootstrap` draws a bootstrap random sample (with
#' replacement) from `data`.
#'
#' @inheritParams partition_cv
#'
#' @param coords vector of length 2 defining the variables in `data` that
#' contain the x and y coordinates of sample locations.
#' @param nboot Size of bootstrap sample
#' @param oob logical (default `FALSE`): if `TRUE`, use the out-of-bag sample as
#' the test sample; if `FALSE`, draw a second bootstrap sample of size `nboot`
#' independently to obtain a test sample.
#'
#' @return A [represampling] object. This is a (named) list containing
#' `length(repetition)`. [resampling] objects. Each of these contains only one
#' list with indices of `train`ing and `test` samples. Indices are row indices
#' for `data`.
#'
#' @examples
#' data(ecuador)
#' # only 10 bootstrap repetitions, normally use >=100:
#' parti <- represampling_bootstrap(ecuador, repetition = 10)
#' # plot(parti, ecuador) # careful: overplotting occurs
#' # because some samples are included in both the training and
#' # the test sample (possibly even multiple times)
#' @export
represampling_bootstrap <- function(data, coords = c("x", "y"),
nboot = nrow(data),
repetition = 1, seed1 = NULL, oob = FALSE) {
resample <- list()
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt) # nocov
}
# Bootstrap training sample, drawn with replacement:
train <- sample(nrow(data), nboot, replace = TRUE)
if (oob) {
# test set = out of bag sample:
test <- c(seq_len(nrow(data)))[!(c(seq_len(nrow(data))) %in% train)] # nocov
} else {
# test set = independently drawn bootstrap sample
test <- sample(nrow(data), nboot, replace = TRUE)
}
resample[[as.character(cnt)]] <- list(`1` = list(
train = train,
test = test
))
}
return(as.represampling(resample)) # nolint
}
#' @title Bootstrap at an aggregated level
#'
#' @description `represampling_factor_bootstrap` resamples partitions defined by
#' a factor variable. This can be used for non-overlapping block bootstraps
#' and similar.
#'
#' @inheritParams represampling_bootstrap
#'
#' @param fac defines a grouping or partitioning of the samples in `data`; three
#' possible types: (1) the name of a variable in `data` (coerced to factor if
#' not already a factor variable); (2) a factor variable (or a vector that can
#' be coerced to factor); (3) a list of factor variables (or vectors that can
#' be coerced to factor); this list must be of length `length(repetition)`,
#' and if it is named, the names must be equal to `as.character(repetition)`;
#' this list will typically be generated by a `partition.*` function with
#' `return_factor = TRUE` (see Examples below)
#'
#' @param nboot number of bootstrap replications used for generating the
#' bootstrap training sample (`nboot[1]`) and the test sample (`nboot[2]`);
#' `nboot[2]` is ignored (with a warning) if `oob = TRUE`. A value of `-1`
#' will be substituted with the number of levels of the factor variable,
#' corresponding to an *n* out of *n* bootstrap at the grouping level defined
#' by `fac`.
#' @param oob if `TRUE`, the test sample will be the out-of-bag sample; if
#' `FALSE` (default), the test sample is an independently drawn bootstrap
#' sample of size `nboot[2]`.
#' @details `nboot` refers to the number of groups (as defined by the factors)
#' to be drawn with replacement from the set of groups. I.e., if `fac` is a
#' factor variable, `nboot` would normally not be greater than `nlevels(fac)`,
#' `nlevels(fac)` being the default as per `nboot = -1`.
#'
#' @seealso [represampling_disc_bootstrap],
#' [represampling_tile_bootstrap]
#'
#' @examples
#' data(ecuador)
#' # a dummy example for demonstration, performing bootstrap
#' # at the level of an arbitrary factor variable:
#' parti <- represampling_factor_bootstrap(ecuador,
#' factor(floor(ecuador$dem / 100)),
#' oob = TRUE
#' )
#' # plot(parti,ecuador)
#' # using the factor bootstrap for a non-overlapping block bootstrap
#' # (see also represampling_tile_bootstrap):
#' fac <- partition_tiles(ecuador,
#' return_factor = TRUE, repetition = c(1:3),
#' dsplit = 500, min_n = 200, rotation = "random",
#' offset = "random"
#' )
#' parti <- represampling_factor_bootstrap(ecuador, fac,
#' oob = TRUE,
#' repetition = c(1:3)
#' )
#' # plot(parti, ecuador)
#' @export
represampling_factor_bootstrap <- function(data, fac, repetition = 1,
nboot = -1,
seed1 = NULL, oob = FALSE) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
if (oob && length(nboot) > 1) {
warning("nboot[2] ignored because 'oob = TRUE'") # nocov
}
if (is.list(fac)) {
stopifnot(length(fac) == length(repetition))
if (is.null(names(fac))) {
names(fac) <- as.character(repetition) # nocov
} else {
stopifnot(all(as.character(repetition) %in% names(fac)))
}
} else {
if (length(fac) == 1 && is.character(fac)) {
fac <- data[, fac] # nocov
} else {
stopifnot(length(fac) == nrow(data))
}
fac <- factor(fac)
}
if (length(nboot) == 1) {
nboot <- rep(nboot, 2)
}
resample <- list()
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt) # nocov
}
# what factor variable to resample?:
if (is.list(fac)) {
the_fac <- factor(fac[[as.character(cnt)]])
} else {
the_fac <- fac
}
# How many bootstrap samples (at the factor level)?:
the_nboot <- nboot
if (the_nboot[1] < 0) {
the_nboot[1] <- nlevels(the_fac)
}
if (the_nboot[2] < 0) {
the_nboot[2] <- nlevels(the_fac)
}
# Factor levels to be used in training sample:
train <- sample(levels(the_fac), the_nboot[1], replace = TRUE)
# Factor levels to be used for test sample: out-of-bag, i.e. factors that
# are not used for the training sample:
if (oob) {
test <- levels(the_fac)[!(levels(the_fac) %in% train)]
} else {
# second, independently drawn bootstrap sample at the factor level:
test <- sample(levels(the_fac), the_nboot[2], replace = TRUE)
}
# Turn factor levels into indices:
train <- unlist(sapply(train, function(x) which(the_fac != x)),
use.names = FALSE
)
test <- unlist(sapply(test, function(x) which(the_fac == x)),
use.names = FALSE
)
# Compile training and test indices into a resampling object:
resample[[as.character(cnt)]] <- as.resampling(list(
`1` =
list(
train = train,
test = test
)
))
}
resample <- as.represampling(resample) # nolint
return(resample)
}
#' @title Spatial block bootstrap using rectangular blocks
#'
#' @description `represampling_tile_bootstrap` performs a non-overlapping
#' spatial block bootstrap by resampling at the level of rectangular
#' partitions or 'tiles' generated by [partition_tiles].
#'
#' @inheritParams represampling_bootstrap
#'
#' @param nboot see [represampling_factor_bootstrap]
#' @param oob see [represampling_factor_bootstrap]
#' @param ... additional arguments to be passed to [partition_tiles]
#'
#' @export
represampling_tile_bootstrap <- function(data,
coords = c("x", "y"),
repetition = 1,
nboot = -1,
seed1 = NULL,
oob = FALSE,
...) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
parti <- partition_tiles(
data = data, coords = coords,
repetition = repetition,
seed1 = seed1, return_factor = TRUE, ...
)
repres <- represampling_factor_bootstrap(
data = data, fac = parti,
repetition = repetition,
seed1 = seed1, nboot = nboot,
oob = oob
)
return(repres)
}
#' @title Spatial block bootstrap at the level of spatial k-means clusters
#'
#' @description `represampling_kmeans_bootstrap` performs a non-overlapping
#' spatial block bootstrap by resampling at the level of irregularly-shaped
#' partitions generated by [partition_kmeans].
#'
#' @inheritParams represampling_bootstrap
#'
#' @param nfold see [partition_kmeans]
#' @param nboot see [represampling_factor_bootstrap]
#' @param oob see [represampling_factor_bootstrap]
#' @param ... additional arguments to be passed to [partition_kmeans]
#' @keywords internal
#' @export
represampling_kmeans_bootstrap <- function(data,
coords = c("x", "y"),
repetition = 1,
nfold = 10,
nboot = nfold,
seed1 = NULL,
oob = FALSE,
...) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
# FIXME: Why is partition_tiles used here?
# See also https://github.com/giscience-fsu/sperrorest/issues/50
# hidden to the user for now
parti <- partition_tiles(
data = data,
coords = coords,
repetition = repetition,
seed1 = seed1,
return_factor = TRUE,
...
)
repres <- represampling_factor_bootstrap(
data = data,
fac = parti,
repetition = repetition,
seed1 = seed1,
nboot = nboot,
oob = oob
)
return(repres)
}
#' @title Overlapping spatial block bootstrap using circular blocks
#'
#' @description `represampling_disc_bootstrap` performs a spatial block
#' bootstrap by resampling at the level of rectangular partitions or 'tiles'
#' generated by `partition_tiles`.
#'
#' @inheritParams represampling_bootstrap
#'
#' @param oob logical (default `FALSE`): if `TRUE`, use the out-of-bag sample as
#' the test sample (the complement of the `nboot[1]` test set discs, minus the
#' buffer area as specified in the `...` arguments to [partition_disc]); if
#' `FALSE`, draw a second bootstrap sample of size `nboot` independently to
#' obtain a test sample (sets of overlapping discs drawn with replacement).
#' @param nboot number of bootstrap samples; you may specify different values
#' for the training sample (`nboot[1]`) and for the test sample (`nboot[2]`).
#' @param ... additional arguments to be passed to [partition_disc]; note that a
#' `buffer` argument has not effect if `oob=FALSE`; see example below
#'
#' @note Performs `nboot` out of `nrow(data)` resampling of circular discs. This
#' is an *overlapping* spatial block bootstrap where the blocks are circular.
#'
#' @examples
#' data(ecuador)
#' # Overlapping disc bootstrap:
#' parti <- represampling_disc_bootstrap(ecuador,
#' radius = 200, nboot = 20,
#' oob = FALSE
#' )
#' # plot(parti, ecuador)
#' # Note that a 'buffer' argument would make no difference because boostrap
#' # sets of discs are drawn independently for the training and test sample.
#' #
#' # Overlapping disc bootstrap for training sample, out-of-bag sample as test
#' # sample:
#' parti <- represampling_disc_bootstrap(ecuador,
#' radius = 200, buffer = 200,
#' nboot = 10, oob = TRUE
#' )
#' # plot(parti,ecuador)
#' @export
represampling_disc_bootstrap <- function(data,
coords = c("x", "y"),
nboot,
repetition = 1,
seed1 = NULL,
oob = FALSE, ...) {
if (length(repetition) < 2) {
repetition <- seq(1, repetition, by = 1)
}
if (oob && length(nboot) > 1) {
warning("nboot[2] ignored because oob = TRUE") # nocov
}
if (length(nboot) == 1) {
nboot <- c(nboot, nboot)
}
# nocov start
if (oob) {
resample <- list()
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt)
}
train <- partition_disc(
data = data, coords = coords,
repetition = c(1:nboot[1]),
replace = TRUE, ndisc = 1, seed1 = NULL,
return_train = TRUE, ...
)
test <- c(seq_len(nrow(data)))
for (i in 1:nboot[1]) {
test <- test[test %in% train[[i]][[1]]$train] # yes, $train!
train[[i]][[1]]$train <- NULL
}
if (length(test) == 0) {
warning("empty test set in 'partition_disc.bootstrap':\n'buffer'
and/or 'radius' and/or 'nboot' too large?")
}
train <- unname(unlist(train))
resample[[as.character(cnt)]] <- list(`1` = list(
train = train,
test = test
))
} # nocov end
} else {
resample <- list()
for (cnt in repetition) {
if (!is.null(seed1)) {
set.seed(seed1 + cnt) # nocov
}
train <- partition_disc(
data = data, coords = coords,
repetition = c(1:nboot[1]),
seed1 = NULL, replace = TRUE, ndisc = 1,
return_train = FALSE, ...
)
train <- unname(unlist(train))
test <- partition_disc(
data = data, coords = coords,
repetition = c(1:nboot[2]),
seed1 = NULL, replace = TRUE, ndisc = 1,
return_train = FALSE, ...
)
test <- unname(unlist(test))
resample[[as.character(cnt)]] <- list(`1` = list(
train = train,
test = test
))
}
}
resample <- as.represampling(resample) # nolint
return(resample)
}
#' @title Plot spatial resampling objects
#'
#' @description `plot.represampling` displays the partitions or samples
#' corresponding arising from the resampling of a data set.
#'
#' @method plot represampling
#'
#' @param x a [represampling] resp. [resampling] object.
#' @param data a `data.frame` of samples containing at least the x and y
#' coordinates of samples as specified by `coords`.
#' @param coords vector of length 2 defining the variables in `data` that
#' contain the x and y coordinates of sample locations.
#' @param pch point symbol (to be passed to [points]).
#' @param wiggle_sd 'wiggle' the point locations in x and y direction to avoid
#' overplotting of samples drawn multiple times by bootstrap methods; this is
#' a standard deviation (in the units of the x/y coordinates) of a normal
#' distribution and defaults to 0 (no wiggling).
#' @param ... additional arguments to [plot].
#'
#' @note This function is not intended for samples obtained by resampling with
#' replacement (e.g., bootstrap) because training and test points will be
#' overplotted in that case. The size of the plotting region will also limit
#' the number of maps that can be displayed at once, i.e., the number of rows
#' (repetitions) and fields (columns).
#'
#' @examples
#' data(ecuador)
#' # non-spatial cross-validation:
#' resamp <- partition_cv(ecuador, nfold = 5, repetition = 1:2)
#' # plot(resamp, ecuador)
#' # spatial cross-validation using k-means clustering:
#' resamp <- partition_kmeans(ecuador, nfold = 5, repetition = 1:2)
#' # plot(resamp, ecuador)
#' @name plot.represampling
#' @export
plot.represampling <- function(x, data, coords = c("x", "y"), pch = "+",
wiggle_sd = 0, ...) {
# nocov start
if (missing(data)) {
stop("'data' argument missing")
}
stopifnot(wiggle_sd >= 0)
resample <- x
nr <- length(resample)
nc <- max(sapply(resample, length))
if (nr > 5) {
warning("Probably too many repetitions length(x) to be able to\n
plot represampling object x. Trying anyway...")
}
if (nr > 7) {
warning("Probably too many folds max(sapply(x,length)) to\n
be able to plot represampling object x.
Trying anyway...")
}
op <- par(no.readonly = TRUE)
par(
mfrow = c(nr, nc), mar = c(2, 2, 3, 0.5), mgp = c(2, 0.7, 0), tcl = -0.3,
cex = 0.5
)
for (i in seq_along(resample)) {
for (j in seq_along(resample[[i]])) {
seltrain <- resample[[i]][[j]]$train
seltest <- resample[[i]][[j]]$test
main <- paste("Repetition ", names(resample)[i], ", Fold ", j)
plot(data[, coords[1]], data[, coords[2]],
pch = ".", type = "n",
main = main,
xlab = "", ylab = "", ...
) # xlab=coords[1], ylab=coords[2])
wxtrain <- rnorm(length(seltrain), sd = wiggle_sd)
wytrain <- rnorm(length(seltrain), sd = wiggle_sd)
wxtest <- rnorm(length(seltest), sd = wiggle_sd)
points(data[seltrain, coords[1]] + wxtrain, data[seltrain, coords[2]] +
wytrain, pch = pch)
points(data[seltest, coords[1]] + wxtest, data[seltest, coords[2]] +
wxtest, pch = pch, col = "red")
}
}
par(op)
}
#' @rdname plot.represampling
#' @name plot.resampling
#' @method plot resampling
plot.resampling <- function(x, ...) {
x <- as.represampling(list(`1` = x)) # nolint
plot.represampling(x) # nolint # nocov end
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.