#' Find smallest subset to exclude from clustered sample for effect/pattern to disappear
#'
#' The function iteratively learns which groups (see argument grouping_var) should at least be excluded from
#' the data to reach a conservative 'goal value' for the statistic of interest.
#' It does so by relying on a genetic algorithm, which efficiently explores the (usually vast)
#' space of possible subsets. The result can uncover impactful subsamples and fuel discussions of robustness.
#' Necessary arguments include the dataframe,
#' a function to compute the statistic of interest ('statistic_computation' see examples), the column with the grouping variable ('grouping_var'),
#' and the goal value of interest.
#'
#' @param data A data.frame containing the observations as rows.
#' @param goal_value This conservative value (e.g., small effect size) is targeted.
#' @param statistic_computation A formula which has 'data' as input and returns the statistic of interest.
#' @param pop Number of 'individuals' in each generation of the genetic algorithm.
#' @param max_generations Maximum number of generations that the algorithm generates.
#' @param exclusion_cost Used to calibrate fitness function.
#' @param prop_included_cases Initial proportion of included groups (e.g. .90).
#' @param chance_of_mutation Chance that a gene mutates, higher is slower but more accurate (e.g. .02).
#' @param stop_search After how many generations without change is the 'converged' result returned.
#' @param random_seed Seed for replicability.
#' @param max_exclusions maximum number of groups to be excluded
#'
#' @return Vector of zeros and ones with length equal to number of observations in data. Ones indicate exclusion.
#'
#' @examples
#' set.seed(42)
#' groups = c(0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,0,0,0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8)
#' v1 = rnorm(length(groups))+0.7
#' v2 = rnorm(length(groups))
#'
#' df = as.data.frame(cbind(groups, v1,v2))
#'
#' st = function(data){
#' t.test(data$v1, data$v2)$p.value
#' }
#'
#' multilevel_break(df, statistic_computation = st, goal_value = 0.05, grouping_var = 'groups', max_exclusions = 8)
#'
#' @export
multilevel_break = function(data = NULL,#a data.frame containing the observations as rows
goal_value = NULL, #how many cases need to be excluded to reach this value
statistic_computation = NULL, #formula for the computation of the statistic
grouping_var = NULL,
max_exclusions = NULL,
pop = 200, #population size of each generation (number datasets), higher is slower but more accurate min 500
max_generations = 300, #maximum number of generations. higher is slower but more accurate min 100
exclusion_cost = 0.01, #stable exclusion costs, should not need tuning
prop_included_cases = 0.90, #initial proportion of included cases (0-1), lower is much slower but more accurate
chance_of_mutation = 0.05, #chance that a gene mutates, higher is slower but more accurate max 0.1
stop_search = 100,#after how many generations without improvements is result returned
random_seed = 42){
# setup -------------------------------------------------------------------
set.seed(random_seed)
if(is.null(max_exclusions)){
max_exclusions = length(unique(data[,grouping_var]))
print(paste("max_exclusions argument was not provided. Using ", max_exclusions, " as limit."))
}
if(is.null(data)){
stop("Please provide the data under the 'data' argument.")
}
if(is.null(statistic_computation)){
stop("A function must be provided as the 'statistic_computation' argument which takes data as input and returns the statistic of interest.")
}
if(is.null(goal_value)){
stop("A numeric value must be provided as the 'goal_value' argument which indcates the conservative point of interest for the researcher.")
}
if(is.null(grouping_var)){
stop("Please provide the name of the column containing the grouping variable (grouping_var)")
}
# define custom version of genalg function ---------------------------------------
#original version here: https://github.com/cran/genalg/tree/master/R
rbga.bin <- function(
size=NA,
suggestions=NULL,
popSize=NA, iters=NA,
mutationChance=NA,
elitism=NA, zeroToOneRatio=NA,
monitorFunc=NULL, evalFunc=NULL,
showSettings=FALSE, verbose=FALSE,
parentProb= NA, stop_search = NA
) {
if (is.null(evalFunc)) {
stop("An evaluation function must be provided. See the evalFunc parameter.");
}
vars = size;
if (is.na(mutationChance)) {
mutationChance = 1/(vars+1);
}
if (is.na(elitism)) {
elitism = floor(popSize/5)
}
if (verbose) cat("Testing the sanity of parameters...\n");
if (popSize < 5) {
stop("The population size must be at least 5.");
}
if (iters < 1) {
stop("The number of iterations must be at least 1.");
}
if (!(elitism < popSize)) {
stop("The population size must be greater than the elitism.");
}
if (showSettings) {
if (verbose) cat("The start conditions:\n");
result = list(size=size, suggestions=suggestions,
popSize=popSize, iters=iters,
elitism=elitism, mutationChance=mutationChance);
class(result) = "rbga";
cat(summary(result));
} else {
if (verbose) cat("Not showing GA settings...\n");
}
if (vars > 0) {
if (!is.null(suggestions)) {
if (verbose) cat("Adding suggestions to first population...\n");
population = matrix(nrow=popSize, ncol=vars);
suggestionCount = dim(suggestions)[1]
for (i in 1:suggestionCount) {
population[i,] = suggestions[i,]
}
if (verbose) cat("Filling others with random values in the given domains...\n");
for (child in (suggestionCount+1):popSize) {
population[child,] = sample(c(rep(0,zeroToOneRatio),1), vars, replace=TRUE);
while (sum(population[child,]) == 0) {
population[child,] = sample(c(rep(0,zeroToOneRatio),1), vars, replace=TRUE);
}
}
} else {
if (verbose) cat("Starting with random values in the given domains...\n");
# start with an random population
population = matrix(nrow=popSize, ncol=vars);
# fill values
for (child in 1:popSize) {
population[child,] = sample(c(rep(0,zeroToOneRatio),1), vars, replace=TRUE);
while (sum(population[child,]) == 0) {
population[child,] = sample(c(rep(0,zeroToOneRatio),1), vars, replace=TRUE);
}
}
}
# do iterations
stability = 1
bestsolution = 999999
bestEvals = rep(NA, iters);
meanEvals = rep(NA, iters);
evalVals = rep(NA, popSize);
for (iter in 1:iters) {
if (verbose) cat(paste("Starting iteration", iter, "\n"));
# calculate each object
if (verbose) cat("Calucating evaluation values... ");
for (object in 1:popSize) {
if (is.na(evalVals[object])) {
evalVals[object] = evalFunc(population[object,]);
if (verbose) cat(".");
}
}
if(stability == stop_search){
print(paste(stop_search,"iterations without change"))
result = list(type="binary chromosome", size=size,
popSize=popSize, iters=iters, suggestions=suggestions,
population=population, elitism=elitism, mutationChance=mutationChance,
evaluations=evalVals, best=bestEvals, mean=meanEvals, stability = stability, stop_search = stop_search);
class(result) = "rbga";
return(result);
}
bestEvals[iter] = min(evalVals);
if (min(evalVals) == bestsolution){
stability = stability +1;
bestsolution = min(evalVals);
}else{
bestsolution = min(evalVals);
stability = 1
}
meanEvals[iter] = mean(evalVals);
if (verbose) cat(" done.\n");
if (!is.null(monitorFunc)) {
if (verbose) cat("Sending current state to rgba.monitor()...\n");
# report on GA settings
result = list(type="binary chromosome", size=size,
popSize=popSize, iter=iter, iters=iters,
population=population, elitism=elitism, mutationChance=mutationChance,
evaluations=evalVals, best=bestEvals, mean=meanEvals, stability = stability, stop_search = stop_search);
class(result) = "rbga";
monitorFunc(result);
}
if (iter < iters) { # ok, must create the next generation
if (verbose) cat("Creating next generation...\n");
newPopulation = matrix(nrow=popSize, ncol=vars);
newEvalVals = rep(NA, popSize);
if (verbose) cat(" sorting results...\n");
sortedEvaluations = sort(evalVals, index=TRUE);
sortedPopulation = matrix(population[sortedEvaluations$ix,], ncol=vars);
# save the best
if (elitism > 0) {
if (verbose) cat(" applying elitism...\n");
newPopulation[1:elitism,] = sortedPopulation[1:elitism,];
newEvalVals[1:elitism] = sortedEvaluations$x[1:elitism]
} # ok, save nothing
# fill the rest by doing crossover
if (vars > 1) {
if (verbose) cat(" applying crossover...\n");
for (child in (elitism+1):popSize) {
# ok, pick two random parents
parentIDs = sample(1:popSize, 2, prob= parentProb)
parents = sortedPopulation[parentIDs,];
crossOverPoint = sample(0:vars,1);
if (crossOverPoint == 0) {
newPopulation[child, ] = parents[2,]
newEvalVals[child] = sortedEvaluations$x[parentIDs[2]]
} else if (crossOverPoint == vars) {
newPopulation[child, ] = parents[1,]
newEvalVals[child] = sortedEvaluations$x[parentIDs[1]]
} else {
newPopulation[child, ] =
c(parents[1,][1:crossOverPoint],
parents[2,][(crossOverPoint+1):vars])
while (sum(newPopulation[child,]) == 0) {
newPopulation[child,] = sample(c(rep(0,zeroToOneRatio),1), vars, replace=TRUE);
}
}
}
} else { # otherwise nothing to crossover
if (verbose) cat(" cannot crossover (#vars=1), using new randoms...\n");
# random fill the rest
newPopulation[(elitism+1):popSize,] =
sortedPopulation[sample(1:popSize, popSize-elitism),];
}
population = newPopulation;
evalVals = newEvalVals;
# do mutation
if (mutationChance > 0) {
if (verbose) cat(" applying mutations... ");
mutationCount = 0;
for (object in (elitism+1):popSize) {
for (var in 1:vars) {
if (runif(1) < mutationChance) { # ok, do mutation
population[object,var] = sample(c(rep(0,zeroToOneRatio),1), 1);
mutationCount = mutationCount + 1;
}
}
}
if (verbose) cat(paste(mutationCount, "mutations applied\n"));
}
}
}
}
# report on GA settings
result = list(type="binary chromosome", size=size,
popSize=popSize, iters=iters, suggestions=suggestions,
population=population, elitism=elitism, mutationChance=mutationChance,
evaluations=evalVals, best=bestEvals, mean=meanEvals, stop_search = stop_search, stability = stability);
class(result) = "rbga";
return(result);
}
# check if maximize or minimize -------------------------------------------
v = statistic_computation(data)
if(v > goal_value){
maximize = FALSE
}else if(v < goal_value){#not optimal to always recompute, should also check if numeric
maximize = TRUE
}else if(v == goal_value){
stop("Your goal value is equal to the observed statistic in the full sample")
}
# implement evaluate function based on statistic_computation ---------------
if(maximize){
evaluate = function(string=c()){
if(sum(string)> max_exclusions){
return(999999)
}
chosen_groups = unique(data[,grouping_var])[0 == string]
x = statistic_computation(data[is.element(data[,grouping_var],chosen_groups),])
return(sum(string)/length(string)*exclusion_cost + 1/min(x, exclusion_cost^4 * x + goal_value))
}
} else if(!maximize){
evaluate = function(string=c()){
if(sum(string)> max_exclusions){
return(999999)
}
chosen_groups = unique(data[,grouping_var])[0 == string]
x = statistic_computation(data[is.element(data[,grouping_var],chosen_groups),])
return(sum(string)/length(string)*exclusion_cost + max(x, exclusion_cost^4 * x + goal_value))
}
}
#
# implement monitor function based on statistic_computation ---------------
monitor = function(alg){
minEval = min(alg$evaluations)
filter = alg$evaluations == minEval
bestObjectCount = sum(rep(1, alg$popSize)[filter])
if (bestObjectCount > 1) {
bestSolution = alg$population[filter,][1,]
} else {
bestSolution = alg$population[filter,]}
chosen_groups = unique(data[,grouping_var])[0 == bestSolution]
x = statistic_computation(data[is.element(data[,grouping_var],chosen_groups),])
print(alg$iter)
stab = alg$stability
stop = alg$stop_search
# print(paste("dropped observations:", sum(bestSolution)))
# print(paste("observed statistic:", x))
cat('Dropped groups: ', sum(bestSolution), ',', ' Target statistic: ', x, ',', ' Convergence (Generations w.o. change): ', stab , '/', stop, '\n', sep = '')
}
# function for returning best solution after alg finished ---------------------------------------------
best <- function(alg) {
minEval = min(alg$evaluations, na.rm = T)
filter = alg$evaluations == minEval
bestObjectCount = sum(rep(1, alg$popSize)[filter])
# ok, deal with the situation that more than one object is best
if (bestObjectCount > 1) {
bestSolution = alg$population[filter,][1,]
} else {
bestSolution = alg$population[filter,]}
bestSolution}
print(length(unique(data[,grouping_var])))
# run genetic alg with binary genes ------------------------------------------------------------
alg = rbga.bin(size=length(unique(data[,grouping_var])),
suggestions=NULL,
popSize=pop, iters=max_generations,
mutationChance=chance_of_mutation,
elitism=floor(0.1*pop), zeroToOneRatio=(prop_included_cases *length(unique(data[,grouping_var]))) / ((1-prop_included_cases)* length(unique(data[,grouping_var])) + 1^-10),#genes,
monitorFunc=monitor, evalFunc=evaluate,
showSettings=FALSE, verbose=FALSE, stop_search = stop_search,
parentProb= dnorm(1:pop, mean=0, sd=(pop/3)))
# output best filter -------------------------------------------------------------
solution = best(alg)
excluded_groups = unique(data[,grouping_var])[solution == 1]
nr_excluded_groups = length(excluded_groups)
original_value = statistic_computation(data)
new_value = statistic_computation(data[!is.element(data[,grouping_var],excluded_groups),])
output = list("excluded groups" = excluded_groups, "number_exclusions"= nr_excluded_groups, "new_value" = new_value, "original_value" = original_value)
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.