R/perrySplits.R

Defines functions getIndices.bootSamples getIndices.randomSplits getIndices.cvFolds getIndices bootSamples randomSplits cvFolds perrySplits.bootControl perrySplits.splitControl perrySplits.foldControl perrySplits bootControl splitControl foldControl

Documented in bootControl bootSamples cvFolds foldControl perrySplits randomSplits splitControl

# --------------------------------------
# Author: Andreas Alfons
#         Erasmus Universiteit Rotterdam
# --------------------------------------

#' Control object for cross-validation folds
#'
#' Generate an object that controls how to split \eqn{n} observations or
#' groups of observations into \eqn{K} folds to be used for (repeated)
#' \eqn{K}-fold cross-validation.  \eqn{K} should thereby be chosen such that
#' all folds are of approximately equal size.
#'
#' @param K  an integer giving the number of folds into which the observations
#' should be split (the default is five).
#' @param R  an integer giving the number of replications for repeated
#' \eqn{K}-fold cross-validation.
#' @param type  a character string specifying the type of folds to be
#' generated.  Possible values are \code{"random"} (the default),
#' \code{"consecutive"} or \code{"interleaved"}.
#' @param grouping  a factor specifying groups of observations.
#'
#' @return An object of class \code{"foldControl"} with the following
#' components:
#' \describe{
#'   \item{\code{K}}{an integer giving the number of folds.  A value of
#'   \code{K} equal to the number of observations or groups yields
#'   eave-one-out cross-validation.}
#'   \item{\code{R}}{an integer giving the number of replications.  This will
#'   be ignored for for leave-one-out cross-validation and other non-random
#'   splits of the data.}
#'   \item{\code{type}}{a character string specifying the type of folds.}
#'   \item{\code{grouping}}{if supplied, a factor specifying groups of
#'   observations.  The data will then be split according to the groups rather
#'   than individual observations such that all observations within a group
#'   belong to the same fold.}
#' }
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{perrySplits}}, \code{\link{cvFolds}},
#' \code{\link{splitControl}}, \code{\link{bootControl}}
#'
#' @examples
#' set.seed(1234)  # set seed for reproducibility
#' perrySplits(20, foldControl(K = 5))
#' perrySplits(20, foldControl(K = 5, R = 10))
#'
#' @keywords utilities
#'
#' @export

foldControl <- function(K = 5, R = 1,
        type = c("random", "consecutive", "interleaved"),
        grouping = NULL) {
    # check arguments
    K <- round(rep(K, length.out=1))
    if(!isTRUE(K > 1)) stop("'K' must be larger than 1")
    R <- round(rep(R, length.out=1))
    if(!isTRUE(R > 0)) R <- formals()$R  # use default value
    type <- match.arg(type)
    if(!is.null(grouping)) grouping <- as.factor(grouping)
    # construct control object
    control <- list(K=K, R=R, type=type, grouping=grouping)
    class(control) <- "foldControl"
    control
}


#' Control object for random data splits
#'
#' Generate an object that controls how to split \eqn{n} observations or
#' groups of observations into training and test data to be used for (repeated)
#' random splitting (also known as random subsampling or Monte Carlo
#' cross-validation).
#'
#' @param m  an integer giving the number of observations or groups of
#' observations to be used as test data.
#' @param R  an integer giving the number of random data splits.
#' @param grouping  a factor specifying groups of observations.
#'
#' @return An object of class \code{"splitControl"} with the following
#' components:
#' \describe{
#'   \item{\code{m}}{an integer giving the number of observations or groups of
#'   observations to be used as test data.}
#'   \item{\code{R}}{an integer giving the number of random data splits.}
#'   \item{\code{grouping}}{if supplied, a factor specifying groups of
#'   observations.  The data will then be split according to the groups rather
#'   than individual observations such that all observations within a group
#'   belong either to the training or test data.}
#' }
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{perrySplits}}, \code{\link{randomSplits}},
#' \code{\link{foldControl}}, \code{\link{bootControl}}
#'
#' @examples
#' set.seed(1234)  # set seed for reproducibility
#' perrySplits(20, splitControl(m = 5))
#' perrySplits(20, splitControl(m = 5, R = 10))
#'
#' @keywords utilities
#'
#' @export

splitControl <- function(m, R = 1, grouping = NULL) {
    # check arguments
    m <- round(rep(m, length.out=1))
    if(!isTRUE(m > 0)) stop("'m' must be positive")
    R <- round(rep(R, length.out=1))
    if(!isTRUE(R > 0)) R <- formals()$R  # use default value
    if(!is.null(grouping)) grouping <- as.factor(grouping)
    # construct control object
    control <- list(m=m, R=R, grouping=grouping)
    class(control) <- "splitControl"
    control
}


#' Control object for bootstrap samples
#'
#' Generate an object that controls how to draw bootstrap samples and which
#' bootstrap estimator of prediction error to compute.
#'
#' @param R  an integer giving the number of bootstrap samples.
#' @param type  a character string specifying a bootstrap estimator.  Possible
#' values are \code{"0.632"} (the default), or \code{"out-of-bag"}.
#' @param grouping  a factor specifying groups of observations.
#'
#' @return An object of class \code{"bootControl"} with the following
#' components:
#' \describe{
#'   \item{\code{R}}{an integer giving the number of bootstrap samples.}
#'   \item{\code{type}}{a character string specifying the type of bootstrap
#'   estimator.}
#'   \item{\code{grouping}}{if supplied, a factor specifying groups of
#'   observations.  The groups will then be resampled rather than individual
#'   observations such that all observations within a group belong either to the
#'   bootstrap sample or the test data.}
#' }
#'
#' @author Andreas Alfons
#'
#' @references
#' Efron, B. (1983) Estimating the error rate of a prediction rule: improvement
#' on cross-validation.  \emph{Journal of the American Statistical
#' Association}, \bold{78}(382), 316--331.
#'
#' @seealso \code{\link{perrySplits}}, \code{\link{bootSamples}},
#' \code{\link{foldControl}}, \code{\link{splitControl}}
#'
#' @examples
#' set.seed(1234)  # set seed for reproducibility
#' perrySplits(20, bootControl())
#' perrySplits(20, bootControl(R = 10))
#'
#' @keywords utilities
#'
#' @export

bootControl <- function(R = 1, type = c("0.632", "out-of-bag"),
        grouping = NULL) {
    # check arguments
    R <- round(rep(R, length.out=1))
    if(!isTRUE(R > 0)) R <- formals()$R  # use default value
    type <- match.arg(type)
    if(!is.null(grouping)) grouping <- as.factor(grouping)
    # construct control object
    control <- list(R=R, type=type, grouping=grouping)
    class(control) <- "bootControl"
    control
}


#' Data splits for resampling-based prediction error measures
#'
#' Split observations or groups of observations into segments to be used
#' for (repeated) \eqn{K}-fold cross-validation, (repeated) random splitting
#' (also known as random subsampling or Monte Carlo cross-validation), or the
#' bootstrap.
#'
#' @param n  an integer giving the number of observations to be split.
#' @param control  a control object of class \code{"foldControl"} (as generated
#' by \code{\link{foldControl}}), \code{"splitControl"} (as generated by
#' \code{\link{splitControl}}) or \code{"bootControl"} (as generated by
#' \code{\link{bootControl}}).
#'
#' @return
#' For the \code{foldControl} method, an object of class \code{"cvFolds"}
#' giving folds for (repeated) \eqn{K}-fold cross-validation (see
#' \code{\link{cvFolds}}).
#'
#' For the \code{splitControl} method, an object of class \code{"randomSplits"}
#' giving random data splits (see \code{\link{randomSplits}}).
#'
#' For the \code{bootControl} method, an object of class \code{"bootSamples"}
#' giving bootstrap samples (see \code{\link{bootSamples}}).
#'
#' @note Users may prefer the wrapper functions \code{\link{cvFolds}},
#' \code{\link{randomSplits}} and \code{\link{bootSamples}}.
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{foldControl}}, \code{\link{splitControl}},
#' \code{\link{bootControl}}, \code{\link{cvFolds}},
#' \code{\link{randomSplits}}, \code{\link{bootSamples}}
#'
#' @examples
#' set.seed(1234)  # set seed for reproducibility
#'
#' ## data folds for K-fold cross-validation
#' perrySplits(20, foldControl(K = 5))
#' perrySplits(20, foldControl(K = 5, R = 10))
#'
#' ## random data splits
#' perrySplits(20, splitControl(m = 5))
#' perrySplits(20, splitControl(m = 5, R = 10))
#'
#' ## bootstrap samples
#' perrySplits(20, bootControl())
#' perrySplits(20, bootControl(R = 10))
#'
#' @keywords utilities
#'
#' @export

perrySplits <- function(n, control) UseMethod("perrySplits", control)

#' @export
perrySplits.foldControl <- function(n, control) {
    # initializations
    K <- control$K
    R <- control$R
    type <- control$type
    grouping <- control$grouping
    # check arguments
    n <- if(is.null(grouping)) round(rep(n, length.out=1)) else nlevels(grouping)
    if(!isTRUE(n > 0)) stop("'n' must be positive")
    if(!isTRUE(K <= n)) stop(sprintf("'K' must be smaller or equal to %d", n))
    if(K == n) type <- "leave-one-out"
    # obtain CV folds
    if(type == "random") {
        # random K-fold splits with R replications
        subsets <- replicate(R, sample.int(n))
    } else {
        # leave-one-out CV or non-random splits, replication not meaningful
        R <- 1
        subsets <- as.matrix(seq_len(n))
    }
    which <- as.factor(rep(seq_len(K), length.out=n))
    if(type == "consecutive") which <- rep.int(seq_len(K), tabulate(which))
    # construct and return object
    folds <- list(n=n, K=K, R=R, subsets=subsets, which=which)
    if(!is.null(grouping))
        folds$grouping <- split(seq_along(grouping), grouping)
    class(folds) <- "cvFolds"
    folds
}

#' @export
perrySplits.splitControl <- function(n, control) {
    # initializations
    m <- control$m
    R <- control$R
    grouping <- control$grouping
    # check arguments
    n <- if(is.null(grouping)) round(rep(n, length.out=1)) else nlevels(grouping)
    if(!isTRUE(n > 0)) stop("'n' must be positive")
    if(!isTRUE(m < n)) stop(sprintf("'m' must be smaller than %d", n))
    # random splits with R replications
    subsets <- replicate(R, sample.int(n, m))
    # construct and return object
    splits <- list(n=n, m=m, R=R, subsets=subsets)
    if(!is.null(grouping))
        splits$grouping <- split(seq_along(grouping), grouping)
    class(splits) <- "randomSplits"
    splits
}

#' @export
perrySplits.bootControl <- function(n, control) {
    # initializations
    R <- control$R
    type <- control$type
    grouping <- control$grouping
    # check arguments
    n <- if(is.null(grouping)) round(rep(n, length.out=1)) else nlevels(grouping)
    if(!isTRUE(n > 0)) stop("'n' must be positive")
    # random splits with R replications
    samples <- replicate(R, sample.int(n, replace=TRUE))
    # drop subsets that contain all the observations and draw new subsets until
    # there are R subsets with out-of-bag-observations
    replace <- whichAllInBag(n, samples)
    newR <- length(replace)
    while(newR > 0) {
        newSamples <- replicate(newR, sample.int(n, replace=TRUE))
        samples[, replace] <- newSamples
        replace <- replace[whichAllInBag(n, newSamples)]
        newR <- length(replace)
    }
    # construct and return object
    splits <- list(n=n, R=R, type=type, samples=samples)
    if(!is.null(grouping))
        splits$grouping <- split(seq_along(grouping), grouping)
    splits$yHat <- control$yHat  # passed internally for 0.632 estimator
    class(splits) <- "bootSamples"
    splits
}


#' Cross-validation folds
#'
#' Split observations or groups of observations into \eqn{K} folds to be used
#' for (repeated) \eqn{K}-fold cross-validation.  \eqn{K} should thereby be
#' chosen such that all folds are of approximately equal size.
#'
#' @aliases print.cvFolds
#'
#' @param n  an integer giving the number of observations to be split into
#' folds.  This is ignored if \code{grouping} is supplied in order to split
#' groups of observations into folds.
#' @param K  an integer giving the number of folds into which the observations
#' should be split (the default is five).  Setting \code{K} equal to the number
#' of observations or groups yields leave-one-out cross-validation.
#' @param R  an integer giving the number of replications for repeated
#' \eqn{K}-fold cross-validation.  This is ignored for for leave-one-out
#' cross-validation and other non-random splits of the data.
#' @param type  a character string specifying the type of folds to be
#' generated.  Possible values are \code{"random"} (the default),
#' \code{"consecutive"} or \code{"interleaved"}.
#' @param grouping  a factor specifying groups of observations.  If supplied,
#' the data are split according to the groups rather than individual
#' observations such that all observations within a group belong to the same
#' fold.
#'
#' @return An object of class \code{"cvFolds"} with the following components:
#' \describe{
#'   \item{\code{n}}{an integer giving the number of observations or groups.}
#'   \item{\code{K}}{an integer giving the number of folds.}
#'   \item{\code{R}}{an integer giving the number of replications.}
#'   \item{\code{subsets}}{an integer matrix in which each column contains a
#'   permutation of the indices of the observations or groups.}
#'   \item{\code{which}}{an integer vector giving the fold for each permuted
#'   observation or group.}
#'   \item{\code{grouping}}{a list giving the indices of the observations
#'   belonging to each group.  This is only returned if a grouping factor
#'   has been supplied.}
#' }
#'
#' @note This is a simple wrapper function for \code{\link{perrySplits}} with a
#' control object generated by \code{\link{foldControl}}.
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{perrySplits}}, \code{\link{foldControl}},
#' \code{\link{randomSplits}}, \code{\link{bootSamples}}
#'
#' @examples
#' set.seed(1234)  # set seed for reproducibility
#' cvFolds(20, K = 5, type = "random")
#' cvFolds(20, K = 5, type = "consecutive")
#' cvFolds(20, K = 5, type = "interleaved")
#' cvFolds(20, K = 5, R = 10)
#'
#' @keywords utilities
#'
#' @export

cvFolds <- function(n, K = 5, R = 1,
        type = c("random", "consecutive", "interleaved"),
        grouping = NULL) {
    # construct control object and call perrySplits()
    perrySplits(n, foldControl(K=K, R=R, type=type, grouping=grouping))
}


#' Random data splits
#'
#' Split observations or groups of observations into training and test data to
#' be used for (repeated) random splitting (also known as random subsampling or
#' Monte Carlo cross-validation).
#'
#' @aliases print.randomSplits
#'
#' @param n  an integer giving the number of observations to be split into
#' training and test data.  This is ignored if \code{grouping} is supplied in
#' order to split groups of observations into folds.
#' @param m  an integer giving the number of observations or groups of
#' observations to be used as test data.
#' @param R  an integer giving the number of random data splits.
#' @param grouping  a factor specifying groups of observations.  If supplied,
#' the data are split according to the groups rather than individual
#' observations such that all observations within a group belong either to the
#' training or test data.
#'
#' @return An object of class \code{"randomSplits"} with the following
#' components:
#' \describe{
#'   \item{\code{n}}{an integer giving the number of observations or groups.}
#'   \item{\code{m}}{an integer giving the number of observations or groups in
#'   the test data.}
#'   \item{\code{R}}{an integer giving the number of random data splits.}
#'   \item{\code{subsets}}{an integer matrix in which each column contains
#'   the indices of the observations or groups in the test data of the
#'   corresponding random data split.}
#'   \item{\code{grouping}}{a list giving the indices of the observations
#'   belonging to each group.  This is only returned if a grouping factor
#'   has been supplied.}
#' }
#'
#' @note This is a simple wrapper function for \code{\link{perrySplits}} with a
#' control object generated by \code{\link{splitControl}}.
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{perrySplits}}, \code{\link{splitControl}},
#' \code{\link{cvFolds}}, \code{\link{bootSamples}}
#'
#' @examples
#' set.seed(1234)  # set seed for reproducibility
#' randomSplits(20, m = 5)
#' randomSplits(20, m = 5, R = 10)
#'
#' @keywords utilities
#'
#' @export

randomSplits <- function(n, m, R = 1, grouping = NULL) {
    # construct control object and call perrySplits()
    perrySplits(n, splitControl(m=m, R=R, grouping=grouping))
}


#' Bootstrap samples
#'
#' Draw bootstrap samples of observations or groups of observations and specify
#' which bootstrap estimator of prediction error to compute.
#'
#' @aliases print.bootSamples
#'
#' @param n  an integer giving the number of observations for which to draw
#' bootstrap samples.  This is ignored if \code{grouping} is supplied in
#' order to respect the group structure of the data in the bootstrap samples.
#' @param R  an integer giving the number of bootstrap samples.
#' @param type  a character string specifying a bootstrap estimator.  Possible
#' values are \code{"0.632"} (the default), or \code{"out-of-bag"}.
#' @param grouping  a factor specifying groups of observations.  If supplied,
#' the groups are resampled rather than individual observations such that all
#' observations within a group belong either to the bootstrap sample or the
#' test data.
#'
#' @return An object of class \code{"bootSamples"} with the following
#' components:
#' \describe{
#'   \item{\code{n}}{an integer giving the number of observations or groups.}
#'   \item{\code{R}}{an integer giving the number of bootstrap samples.}
#'   \item{\code{subsets}}{an integer matrix in which each column contains the
#'   indices of the observations or groups in the corresponding bootstrap
#'   sample.}
#'   \item{\code{grouping}}{a list giving the indices of the observations
#'   belonging to each group.  This is only returned if a grouping factor
#'   has been supplied.}
#' }
#'
#' @note This is a simple wrapper function for \code{\link{perrySplits}} with a
#' control object generated by \code{\link{bootControl}}.
#'
#' @author Andreas Alfons
#'
#' @references
#' Efron, B. (1983) Estimating the error rate of a prediction rule: improvement
#' on cross-validation.  \emph{Journal of the American Statistical
#' Association}, \bold{78}(382), 316--331.
#'
#' @seealso \code{\link{perrySplits}}, \code{\link{bootControl}},
#' \code{\link{cvFolds}}, \code{\link{randomSplits}}
#'
#' @examples
#' set.seed(1234)  # set seed for reproducibility
#' bootSamples(20)
#' bootSamples(20, R = 10)
#'
#' @keywords utilities
#'
#' @export

bootSamples <- function(n, R = 1, type = c("0.632", "out-of-bag"),
        grouping = NULL) {
    # construct control object and call perrySplits()
    perrySplits(n, bootControl(R=R, type=type, grouping=grouping))
}


## retrieve indices for r-th replication
getIndices <- function(x, ...) UseMethod("getIndices")

getIndices.cvFolds <- function(x, r = 1, ...) {
    # split permuted items according to the folds
    subsets <- split(x$subsets[, r], x$which)
    # in case of grouped data, the list contains the group indices in each CV
    # fold, so the indices of the respective observations need to be extracted
    if(!is.null(grouping <- x$grouping))
        subsets <- lapply(subsets, function(s) unlist(grouping[s], use.names=FALSE))
    # return list of indices for CV folds
    names(subsets) <- NULL
    subsets
}

getIndices.randomSplits <- function(x, r = 1, ...) {
    subsets <- x$subsets[, r]
    # in case of grouped data, the matrix contains the indices of the groups in
    # the test data, so the indices of the respective observations need to be
    # extracted
    if(!is.null(grouping <- x$grouping))
        subsets <- unlist(grouping[subsets], use.names=FALSE)
    # return matrix of indices for test data
    subsets
}

getIndices.bootSamples <- function(x, r = 1, ...) {
    samples <- x$samples[, r]
    # in case of grouped data, the matrix contains the indices of the groups in
    # the bootstrap samples, so the indices of the respective observations need
    # to be extracted
    if(!is.null(grouping <- x$grouping))
        samples <- unlist(grouping[samples], use.names=FALSE)
    # return matrix of indices for bootstrap samples
    samples
}

Try the perry package in your browser

Any scripts or data that you put into this service are public.

perry documentation built on Nov. 3, 2021, 5:08 p.m.