#' Create sets having minimal differences
#'
#' Is used to assign a set of items to N groups. Differences between
#' groups are minimized with regard to specified criteria (E.g.:
#' minimize differences in mean test scores between school classes).
#'
#' @param dat A data.frame containing the set that is to be
#' regrouped. All assignment criteria must be columns of this
#' data.frame.
#' @param criteria_scale A string vector naming all continuous,
#' numerical columns in `dat` that are to be considered as criteria
#' in set assignment. Can be left out if only nominal variables are
#' to be equalized between groups.
#' @param criteria_nominal A string vector naming all nominal column
#' variables in `dat`, that are to be considered as criteria in set
#' assignment. Can be left out, if only continuous variables are to
#' be equalized between groups. A maximum of two nominal criteria
#' can be realized.
#' @param sets_n How many equal groups are to be created.
#' @param repetitions How many random assignments are to be tested. Only
#' use if `exact` == FALSE.
#' @param exact Should _all_ possible assignments be tested? This yields
#' the "optimal" solution given the assignment ciriteria and
#' functions. Defaults to `FALSE`, in which case a random subset of
#' all possible assignments will be tested.
#' @param tolerance_nominal Use only if argument `criteria_nominal` is
#' also passed. This argument indicates the tolerated frequency
#' deviations for nominal variables (and their combinations) between
#' newly created sets. Must be a one-value vector if one nominal
#' variable is passed; must be a three-value vector if two nominal
#' variables are passed (the second value is the tolerance value for
#' the second variable and the third value is the tolerance value
#' for the combinations of both variables). It is possible that no
#' assignment will be found that fits the tolerance requirements; if
#' unsure how to use this parameter, start using large tolerance
#' values and observe the group assigments.
#' @param equalize A list of functions. These functions determine which
#' criterion is minimized between sets: differences in function
#' return values are minimized. The default function that is
#' operated on is `mean`; in this case, the mean values of the
#' specified criteria (via argument `criteria_scale`) are matched
#' between sets. Can be any function that returns a single value
#' vector.
#' @param write_file Boolean. Will newly found better fitting sets be
#' written to a file automatically? (This is helpful if your
#' simulation runs unexpectedly long and you need to kill it; in
#' this case the best match is not lost). Defaults to `FALSE`.
#' @param talk Boolean. If `TRUE`, the function will print its progress.
#'
#' @return A \code{data.frame}. Contains all columns from argument `dat`
#' and additionally a column variable `newSet`. This columns
#' contains the set assigment of items to groups that produced the
#' best fit in the previous iterations.
#'
#' @export
#'
#' @author Martin Papenberg \email{martin.papenberg@@hhu.de}
#'
create_groups <- function(dat, criteria_scale=NULL,
criteria_nominal=NULL, sets_n,
repetitions=100, exact = FALSE,
tolerance_nominal=rep(Inf, 3),
equalize=list(mean), write_file = FALSE,
talk = TRUE) {
## How many items are to be reassigned:
cases <- nrow(dat)
## Check for errors and warnings
checkInput(dat, sets_n, criteria_scale, criteria_nominal,
tolerance_nominal, cases)
## Initialize a vector that encodes the assignment to groups
setAssign <- sort(rep_len(1:sets_n, cases))
## Check if a set was passed that was optizmized in a recent run
if (!is.null(dat$newSet)) {
if (length(unique(dat$newSet)) != sets_n) {
stop(paste("Variable newSet was found in data.frame
that was passed. Number of different groups in variable
newSet is not equal to the value that was passed via
argument `sets_n`"))
}
if (talk)
cat("Variable newSet was found - trying to improve previous optimization \n")
best_var <- checkVar(dat, criteria_scale, equalize)
best_assign <- dat$newSet
} else { ## -> start from scratch
best_var <- Inf # start value of variance between new sets
best_assign <- NULL
}
if (exact == TRUE) {
repetitions <- all_combinations(table(setAssign))
}
## for user output
partials <- ceiling(repetitions / 10)
## START iterating
if (talk)
cat("Start iteration 1 of", repetitions, "-", format(Sys.time(), "%a %b %d %X %Y"), "\n")
i <- 1
while (i <= repetitions) {
## generate set assignment:
if (exact == FALSE) {
setAssign <- sample(setAssign)
} else {
setAssign <- next_permutation(setAssign)
}
dat$newSet <- setAssign
## Some output for user:
if (i %% partials == 0 & talk) {
cat("working on iteration", i, "\n")
}
## Check nominal criteria - only use this assignment if the
## nominal criteria are satisfied
if (!is.null(criteria_nominal)) {
nominal_okay <- check_nominal(dat, criteria_nominal, tolerance_nominal, cases)
if (!nominal_okay) {
i <- i + 1
next
}
}
## NOW CONTINUOUS VARIABLES ARE CHECKED: What is done: Minimize
## variance between item sets in regard to the functions
## specified in argument `equalize`; `mean` is the default
## function for which differences between sets are minimized.
if (!is.null(criteria_scale)) {
## check the sum of the deviation variance in all criteria
## for this assignment:
sumVar <- checkVar(dat, criteria_scale, equalize)
## Better fit was found, save the assignment
if (sumVar < best_var) {
if (talk)
cat("Success! Improved set similarity on iteration", i, "\n")
best_assign <- setAssign
best_var <- sumVar
write_file(write_file, dat)
}
}
i <- i + 1
}
## END of all iterations
if (is.null(best_assign)) {
warning("No assignment was found satisfying the restrictions in your nominal criteria. Try to increase the `tolerance_nominal` parameter or increase `repetitions`.")
}
## return best assignment:
dat$newSet <- best_assign
write_file(write_file, dat)
if (talk)
cat("End of iteration", repetitions, "-", format(Sys.time(), "%a %b %d %X %Y"), "\n")
return(dat)
}
write_file <- function(write, dat) {
if (write) {
write.table(file="newSet.csv", dat, dec=",", sep=";",
row.names=FALSE)
}
}
## Method that checks the user input and generates error or warning messsages
checkInput <- function(dat, sets_n, criteria_scale, criteria_nominal,
tolerance_nominal, cases) {
## CHECK FOR ERRORS IN USER INPUT
errMsg <- "Error in function `create_groups`:"
if (all(class(dat) != "data.frame")) {
stop(paste(errMsg, "first argument must be a data.frame"))
}
if (length(criteria_nominal) > 2) {
stop(paste(errMsg, "only two nominal variables can be considered in set creation."))
}
if (length(criteria_nominal) == 2 & length(tolerance_nominal) != 3) {
stop(paste(errMsg, "Three tolerance_nominal values must be passed",
"if two nominal criteria are given."))
}
for (i in c(criteria_scale, criteria_nominal)) {
if( is.na( match(i, names(dat)) )) {
stop(paste(errMsg, "`", i, "` is not a column in the passed data.frame", sep=""))
}
}
## no criterion was passed: Abort
if (is.null(criteria_scale) && is.null(criteria_nominal)) {
stop(paste(errMsg, "no assignment criterion was passed."))
}
## Warnings
## set number does not divide total number of cases, proceed but let user know
if (cases %% sets_n != 0) {
warning(paste("Set number (", sets_n, ") does not divide length of data (",
cases, "). Sets have unequal sizes.", sep=""))
}
## check for NA values in criterion columns
test.frame <- data.frame(dat[criteria_scale], dat[criteria_nominal])
no.na <- sum(apply(test.frame, 2, function(x) any(is.na(x)))) == 0
if (!no.na) {
warning("Warning: criteria contained NA values.")
}
}
## method that checks if the group assigment is consistent with the
## specified tolerance level for deviation in categorical variables:
check_nominal <- function(itemDataRnd, criteria_nominal,
tolerance_nominal, cases) {
## Check if nominal variables are OK
## Nominal variable checking is done in the following WHILE loop
number_criteria_nominal <- length(criteria_nominal)
nominal_satisfied <- FALSE
## how is first nominal variable in the new sets distributed
tmpNomData_1 <- unlist(itemDataRnd[criteria_nominal[1]])
tmpNomDistr_1 <- table(itemDataRnd$newSet, tmpNomData_1)
if (length(criteria_nominal) == 1) { # only one nominal variable was passed
devFromPerfect_1 <- max(check2d_table(tmpNomDistr_1))
assignFailed <- devFromPerfect_1 > tolerance_nominal[1]
} else if (length(criteria_nominal) == 2) { # two nominal criteria
tmpNomData_2 <- unlist(itemDataRnd[criteria_nominal[2]])
tmpNomDistr_2 <- table(itemDataRnd$newSet, tmpNomData_2)
tmpNomDistr_intAct <- table(itemDataRnd$newSet, tmpNomData_1, tmpNomData_2)
devFromPerfect_1 <- max(check2d_table(tmpNomDistr_1))
devFromPerfect_2 <- max(check2d_table(tmpNomDistr_2))
devFromPerfect_int <- max(check3d_table(tmpNomDistr_intAct))
assignFailed <- (devFromPerfect_1 > tolerance_nominal[1]) |
(devFromPerfect_2 > tolerance_nominal[2]) |
(devFromPerfect_int > tolerance_nominal[3])
}
## check if assignment of nominal variables to new sets is as required
if (!assignFailed) {
nominal_satisfied <- TRUE
}
return(nominal_satisfied)
}
create_weight_matrix <- function(criteria_scale, equalize) {
mat <- matrix(ncol = length(criteria_scale), nrow=length(equalize))
rownames(mat) <- as.character(substitute(c(mean, sd)))[-1]
colnames(mat) <- criteria_scale
return(mat)
}
# method that returns the total variance between groups in all scale
# criteria
checkVar <- function(data, criteria_scale, equalize) {
# varAllCrits: total variance between sets for all criteria in all `equalize`
# functions
varAllCrits <- NULL
for (t in 1:length(criteria_scale)) {
# tmpVar_1Crit: variance between sets in the t'th criterion
# in all `equalize` functions
tmpVar_1Crit <- NULL
for (j in 1:length(equalize)) {
tmpMeans <- tapply(scale(data[criteria_scale[t]]),
data$newSet, FUN=equalize[[j]], na.rm =TRUE)
tmpVar_1Crit <- c(tmpVar_1Crit, var(tmpMeans, na.rm =TRUE))
}
varAllCrits <- c(varAllCrits, tmpVar_1Crit)
}
sumVar <- sum(varAllCrits)
return(sumVar)
}
# function that checks if frequency differences of nominal variables are between
# sets; check for one criterion (~ 2D table)
check2d_table <- function(d2_table) {
# check dimension of table
if ( length(dim(d2_table)) != 2 ) { stop("error: table dimension must be 2") }
numberCols <- dim(d2_table)[2]
differences <- vector(length=numberCols)
for (i in 1:numberCols) {
differences[i] <- max(d2_table[,i]) - min(d2_table[,i])
}
return(differences)
}
# function that checks if frequencies between nominal sets are okay
# in a 3D table
check3d_table <- function(d3_table) {
if ( length(dim(d3_table)) != 3 ) { stop("error: table dimension must be 3") }
number3dim <- dim(d3_table)[3]
differences <- NULL
for (i in 1:number3dim) {
differences <- c(differences, check2d_table(d3_table[,,i]))
}
return(differences)
}
#' Get the next permutation
#'
#' Each permutation is computed on basis of a passed permutation where
#' lexicographic ordering of the permutations is used to determine the
#' next "higher" permutation. This is an adaption of the
#' `next_permutation` function in C++.
#'
#' @param permutation A vector of elements.
#'
#' @return The next higher permutation of the elements in vector
#' `permutation` with regard to its lexicographic ordering.
#'
#' @export
#'
#' @author Martin Papenberg \email{martin.papenberg@@hhu.de}
#'
next_permutation <- function(permutation) {
n <- length(permutation)
last <- permutation[n]
i <- n
while(last <= permutation[i-1]) {
last <- permutation[i-1]
i <- i - 1
## if lexicographic order is already at the maximum:
if (i-1 == 0) return(sort(permutation))
}
## this algorithm divides the input in a head and a tail; the tail
## is monotonically decreasing
head <- permutation[1:(i-1)]
tail <- permutation[i:length(permutation)]
## which element in the tail is the smallest element that is larger
## than the last element in the head?
larger_values <- tail[tail > head[length(head)]]
## last element of the head:
final_head <- head[length(head)]
## replace last element of head by smallest larger value in tail
head[length(head)] <- min(larger_values)
## replace smallest larger value in tail by final head element
tail[max(which(tail == min(larger_values)))] <- final_head
## reverse tail before returning
return(c(head, rev(tail)))
}
#' Compute the number of all possible set assignments
#'
#' @param x A vector of N set sizes
#'
#' @return The number of all possible assignments to N sets
#'
all_combinations <- function(x) {
set_size <- c(sum(x), sum(x) - cumsum(x))
result <- 1
for (i in 1:(length(x)-1)) {
result <- result * choose(set_size[i], x[i])
}
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.