#' balance a data set according to some grouping factor(s)
#'
#' @param xdata a \code{data.frame}
#' @param whattobalance a character vector with column names. The corresponding columns typically are either factor or character.
#' @param n integer, the number of cases to select for each factor level (or combination of factor levels)
#' @details the function requires either one or two factors to be balanced over
#'
#' if \code{n} is larger than the largest possible number, there will be a warning to that effect and \code{n} will be reset to the largest possible number, i.e. the function behaves as if \code{n = NULL} (the default)
#'
#' @author Christof Neumann
#' @return a list with 5 items
#' \itemize{
#' \item \code{$seldata} the subset of \code{xdata} with the selected rows
#' \item \code{$unseldata} the subset of \code{xdata} with the rows that were not selected
#' \item \code{$sel} the row indices of the selected rows
#' \item \code{$unsel} the row indices of the rows not selected
#' \item \code{$factors} the balance factor(s) (= \code{whattobalance})
#' }
#' @export
#'
#' @examples
#' set.seed(123)
#' xdata <- data.frame(ID = sample(letters[1:4], 30, replace = TRUE),
#' context = sample(LETTERS[21:22], 30, replace = TRUE),
#' var1 = rnorm(30), var2 = rnorm(30))
#' table(xdata$ID, xdata$context)
#' balancedataset(xdata = xdata, whattobalance = c("context"), n = 2)$seldata
#' balancedataset(xdata = xdata, whattobalance = c("context"), n = 3)$seldata
#' balancedataset(xdata = xdata, whattobalance = c("context"))$seldata
#'
#' # with two factors
#' balancedataset(xdata = xdata, whattobalance = c("context", "ID"), n = 1)$seldata
#'
#' # one combination occurs only once (d/V): row 27 has to be in each data set
#' table(xdata$ID, xdata$context)
#' x <- sapply(1:50, function(X){
#' row.names(balancedataset(xdata = xdata, whattobalance = c("context", "ID"))$seldata)
#' })
#' table(x)
#'
#'
balancedataset <- function(xdata, whattobalance, n = NULL) {
if (sum(whattobalance %in% colnames(xdata)) != length(whattobalance)) {
stop("not all factors found in data set", call. = FALSE)
}
# transform into character, so as not having to deal with (empty) factor levels
fac1 <- as.character(xdata[, whattobalance[1]])
# if there is only one such factor to balance over
if (length(whattobalance) == 1) {
# get max n
maxn <- min(table(fac1))
if (!is.null(n)) {
if (n > maxn) {
warning(paste0("'n' is larger than the largest possible value, which is ", maxn, ", and has been set to this value"), call. = FALSE)
n <- maxn
}
}
if (is.null(n)) n <- maxn
# if maxn >= 2 means that we can sample without issue
# if maxn == 1 means that if there is only one possibility, then sample(36, size = 1) would not return 36 each time but a value between 1 and 36, which is undesired behaviour and I haven't found a way around this yet (sample.int()?)
if (maxn > 1) {
x <- as.matrix(data.frame(table(fac1)))
sel <- as.numeric(apply(x, 1, function(X) sample(which(fac1 == X[1]), size = n)))
}
if (maxn == 1) {
x <- as.matrix(data.frame(table(fac1)))
sel <- numeric(nrow(x))
for (i in 1:nrow(x)) {
if (x[i, "Freq"] == "1") sel[i] <- which(fac1 == x[i, 1])
if (x[i, "Freq"] != "1") sel[i] <- sample(which(fac1 == x[i, 1]), size = 1)
}
}
}
if (length(whattobalance) == 2) {
fac2 <- as.character(xdata[, whattobalance[2]])
maxn <- min(table(fac1, fac2))
if (maxn == 0) {
stop("at least one combination does not occur in data", call. = FALSE)
}
if (!is.null(n)) {
if (n > maxn) {
warning(paste0("'n' is larger than the largest possible value, which is ", maxn, ", and has been set to this value"), call. = FALSE)
n <- maxn
}
}
if (is.null(n)) n <- maxn
if (maxn > 1) {
x <- as.matrix(data.frame(table(fac1, fac2)))[, 1:2]
sel <- as.numeric(apply(x, 1, function(X) {
sample(which(fac1 == X[1] & fac2 == X[2]), size = n)
}))
}
if (maxn == 1) {
x <- as.matrix(data.frame(table(fac1, fac2)))
sel <- numeric(nrow(x))
for (i in 1:nrow(x)) {
if (x[i, "Freq"] == "1") {
sel[i] <- which(fac1 == x[i, 1] & fac2 == x[i, 2])
}
if (x[i, "Freq"] != "1") {
sel[i] <- sample(which(fac1 == x[i, 1] & fac2 == x[i, 2] ), size = 1)
}
}
}
}
# prepare results
nr <- 1:nrow(xdata)
# unselected rows
unsel <- nr[!nr %in% sel]
res <- list()
res$seldata <- xdata[sort(sel), ]
res$unseldata <- xdata[unsel, ]
res$sel <- sort(sel)
res$unsel <- unsel
res$factors <- whattobalance
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.