#' Split Strata
#'
#' Splits pre-defined sampling strata based on values of a
#' continuous or categorical variable.
#' For splits on continuous variables, the new strata are defined
#' on left-open intervals. The only exception is the first interval,
#' which must include the overall minimum value. The names of the newly
#' created strata for a split generated
#' from a continuous value are the `split_var` column name with
#' the range of values defining that stratum appended to the
#' old strata name. For a categorical split, the new strata names
#' are the `split_var` column name followed by a
#' 1/0 flag specifying whether the unit is in \code{split_at},
#' all appended to the old strata name.
#' If the `split_var` column name is long,
#' the user can specify a value for `trunc` to prevent the new
#' strata names from being inconveniently long.
#' @param data a dataframe or matrix with one row for each
#' sampling unit, one column specifying each unit's current
#' stratum, one column containing the continuous or categorical
#' values that will define the split, and any other relevant
#' columns.
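#' @param strata a character string or vector of character strings
#' specifying the name(s) of the column(s) that define each unit's
#' current strata. If more than one column is given, the interaction
#' of those columns defines the current strata.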
#' @param split the name of the stratum or strata to be split,
#' exactly as they appear in \code{strata}. Defaults to NULL,
#' which indicates that all strata in \code{strata} will be split.
#' @param split_var a character string specifying the name of the
#' column that should be used to define the strata splits.
#' @param split_at the percentile, value, or name(s) which
#' \code{split_var} should be split at. The interpretation of
#' this input depends on \code{type}. For \code{"quantile"} types,
#' input must be between \code{0} and \code{1}. Defaults to
#' \code{0.5} (median). For \code{"categorical"} type, the
#' input should be a vector of values or names in \code{split_var}
#' that define the new stratum.
#' @param type a character string specifying how the function
#' should interpret the \code{split_at} argument. Must be one of:
#' \itemize{
#' \item \code{"global quantile"}, the default, splits the strata
#' at the quantiles specified in \code{split_at} defined along
#' the entire, unfiltered \code{split_var} column.
#' \item \code{"local quantile"} splits the strata at the
#' quantiles specified in \code{split_at} defined along the
#' filtered \code{split_var} column which only includes units in
#' the stratum being split.
#' \item \code{"value"} splits the strata at the values specified
#' in \code{split_at} along \code{split_var} column.
#' \item \code{"categorical"} splits the strata into two new
#' strata, one that contains each unit where \code{split_var}
#' matches an input of \code{split_at}, and a second that contains
#' every other unit.
#' }
#' @param trunc A numeric or character value specifying how the
#' name of the \code{split_var} should be truncated when naming
#' the new strata. If numeric, the new strata name will only
#' include the first 'n' characters of the \code{split_var} name.
#' If character, the specified string will be used to name the new
#' strata instead of the \code{split_var} name. Defaults to
#' \code{NULL}, which creates the new strata name using the entire
#' name of the \code{split_var} column.
#' @examples
#' x <- split_strata(iris, "Sepal.Length",
#' strata = c("Species"),
#' split = "setosa", split_var = "Sepal.Width",
#' split_at = c(0.5), type = "global quantile"
#' )
#' # You can split at more than one quantile in one call.
#' # The call below splits the "setosa" stratum into three strata
#' # of approximately equal size.
#' x <- split_strata(iris, "Sepal.Length",
#' strata = c("Species"),
#' split = "setosa", split_var = "Sepal.Width", split_at = c(0.33, 0.66),
#' type = "local quantile"
#' )
#' # Manually select split values with type = "value"
#' x <- split_strata(iris, "Sepal.Length",
#' strata = "Species",
#' split = "setosa", split_var = "Sepal.Width",
#' split_at = c(3.1, 3.8), type = "value"
#' )
#' # Perform a categorical split.
#' iris$strata <- rep(c(rep(1, times = 25), rep(0, times = 25)), times = 3)
#' x <- split_strata(iris, "Sepal.Length",
#' strata = "strata",
#' split = NULL, split_var = "Species",
#' split_at = c("virginica", "versicolor"), type = "categorical"
#' )
#' # Splits each of the initial strata 1 and 0 into one stratum with the
#' # "virginica" and "versicolor" species and one stratum with all of the
#' # other species not specified in the split_at argument.
#' @export
#' @return Returns the input dataframe with a new column named
#' 'new_strata' that holds the name of the stratum that each
#' sample belongs to after the split. The column containing the
#' previous strata names is retained and given the name "old_strata".
#' @importFrom magrittr %>%
split_strata <- function(data, strata, split = NULL,
split_var, type = "global quantile",
split_at = .5, trunc = NULL) {
old_strata <- split_variable <- split_var_updated <- NULL
# bind global vars as necessary
if (is.matrix(data)) {
data <-
if ( == FALSE) {
stop("'data' must be a dataframe or matrix with named columns")
if (split_var %in% names(data) == FALSE) {
stop("'split_var' must be a string matching a column name of 'data'")
if (all(strata %in% names(data)) == FALSE) {
stop("'Strata' must be a string or vector of strings
matching column names of data.")
if (all(strata != "old_strata") & "old_strata" %in% names(data)) {
data <- dplyr::select(data, -old_strata)
# fixes error from assigning duplicate names
if (length(strata) > 1) {
strata_q <- enquo(strata)
strata_interact <- data %>%
strata_interact <- interaction(strata_interact)
data <- cbind(data, strata_interact)
data <- dplyr::select(data, -!!strata_q)
names(data)[names(data) == "strata_interact"] <- "old_strata"
} else {
names(data)[names(data) == strata] <- "old_strata"
if (is.null(split) == FALSE) {
# split <- match.arg(split, unique(data[,"old_strata"]))
if (all(split %in% data[, "old_strata"]) == FALSE) {
paste("'", split, "'", sep = ""),
"does not match any value in 'strata'"
} else if (is.null(split) == TRUE) {
split <- sort(unique(data[, "old_strata"]))
# Check for a column that is sorted to preserve order later.
test <- vector()
for (i in seq_len(ncol(data))) {
test[i] <- is.unsorted(data[, i])
if ([i])) {
test[i] <- TRUE
if (test[i] == FALSE) {
if (any(test == FALSE)) {
sort_by <- names(data)[which(test == FALSE)]
} else {
sort_by <- NULL
names(data)[names(data) == split_var] <- "split_variable"
data$old_strata <- as.character(data$old_strata)
type <- match.arg(
"global quantile", "local quantile",
"value", "categorical"
if (type %in% c(
"global quantile", "local quantile", "value",
) == FALSE) {
stop("'type' must be one of 'global quantile',
'local quantile', 'value', 'categorical'")
if (length(trunc) > 1) {
stop("'trunc' must be a single numeric or character value
specifying how the name of 'split_var' should be used in
the new strata names")
# Create new_name to use for new strata names.
if (is.null(trunc) == FALSE) {
if (is.numeric(trunc) & trunc > 0) {
new_name <- substr(split_var, 1, trunc)
} else if (is.numeric(trunc) & trunc < 0) {
new_name <- substr(
split_var, nchar(split_var) + trunc + 1,
} else if (is.character(trunc)) {
new_name <- trunc
} else {
stop("'trunc' must be a single numeric or character value
specifying how the name of 'split_var' should be used
in the new strata names")
} else if (is.null(trunc) == TRUE) {
new_name <- split_var
} else {
stop("'trunc' must be a single numeric or character value
specifying how the name of 'split_var' should be used in
the new strata names")
if (type %in% c("global quantile", "local quantile", "value") &
length(split) <= 1) {
if (is.numeric(data$split_variable) == FALSE) {
stop("'split_var' must be a column of 'data' holding numeric
values. If you want to split on a categorical variable,
use type = 'categorical'.")
if (type == "global quantile" |
(is.null(split) & type == "local quantile")) {
cut_point <- sort(stats::quantile(
data[, "split_variable"],
)) # Find cut points
if (type == "local quantile" & is.null(split) == FALSE) {
cut_point <- sort(stats::quantile(
data[data$old_strata == split, "split_variable"], split_at
if (type == "value") {
cut_point <- sort(split_at)
if (is.null(split) == FALSE) {
if (any(split_at < min(data[
data$old_strata == split,
])) |
any(split_at > max(data[
data$old_strata == split,
]))) {
warning("value(s) of 'split_at' are outside of the range
of values in 'split'")
if (is.null(split)) {
if (any(split_at < min(data$split_variable)) |
any(split_at > max(data$split_variable))) {
warning("value(s) of 'split_at' are outside of the range
of values in 'split'")
if (is.null(split) == FALSE) {
data_filtered <- data %>%
dplyr::filter(old_strata == split)
if (is.null(split) == TRUE) {
data_filtered <- data
if (length(cut_point) == 1) {
data_filtered <- data_filtered %>%
split_var_updated =
ifelse(split_variable <= cut_point[1],
paste("[", round(min(data_filtered$split_variable),
digits = 2
), ",",
round(cut_point[1], digits = 2), "]",
sep = ""
sep = "_"
paste("(", round(cut_point[1],
digits = 2
), ",",
digits = 2
), "]",
sep = ""
sep = "_"
if (length(cut_point) > 1) {
cut_point <- c(
data_filtered$split_var_updated <- data_filtered$split_variable
data_filtered <- data_filtered %>%
split_var_updated =
ifelse(split_variable <= cut_point[2],
paste("[", round(cut_point[1], digits = 2),
",", round(cut_point[2], digits = 2), "]",
sep = ""
sep = "_"
for (i in 3:length(cut_point)) {
data_filtered <- data_filtered %>%
split_var_updated =
ifelse(split_variable > cut_point[1] &
split_variable > cut_point[i - 1] &
split_variable <= cut_point[i],
round(cut_point[i - 1],
digits = 2
digits = 2
), "]",
sep = ""
sep = "_"
new_strata <- interaction(dplyr::select(
data_filtered, old_strata,
small_df <- cbind(new_strata, data_filtered)
small_df <- dplyr::select(small_df, -split_var_updated)
if (type %in% c("global quantile", "local quantile", "value") &
length(split) > 1) {
if (is.numeric(data$split_variable) == FALSE) {
stop(strwrap("'split_var' must be a column of 'data' holding
numeric values. If you want to split on a categorical variable,
use type = 'categorical'.", prefix = " ", initial = ""))
if (type == "global quantile") {
cut_point_list <- list()
for (i in seq_along(split)) {
cut_point_list[[i]] <- sort(stats::quantile(
data[, "split_variable"],
)) # Find cut points
if (type == "local quantile") {
get_cuts <- function(x) {
data$old_strata == x,
], split_at))
cut_point_list <- lapply(split, get_cuts)
if (type == "value") {
cut_point_list <- list()
for (i in seq_along(split)) {
cut_point_list[[i]] <- sort(split_at)
# Find cut points, which are the same everywhere here.
# Warning if some cut points are outside of limits of some strata
if (any(split_at < min(data[
data$old_strata %in% split,
])) |
any(split_at > max(data[
data$old_strata %in% split,
]))) {
warning(strwrap(paste0("value(s) of 'split_at' are outside of
the range of values in ", split),
prefix = " ", initial = ""
# Now perform splits. Each element of split gets the process
# run over it.
data_filtered_list <- list()
for (j in seq_along(split)) {
if (is.null(split[j]) == FALSE) {
data_filtered <- data %>%
dplyr::filter(old_strata == split[j])
if (is.null(split) == TRUE) { # shouldn't happen here
data_filtered <- data
if (length(cut_point_list[[j]]) == 1) {
cut_point <- cut_point_list[[j]]
data_filtered <- data_filtered %>%
split_var_updated =
ifelse(split_variable <= cut_point[1],
paste("[", round(min(data_filtered$split_variable),
digits = 2
), ",",
round(cut_point[1], digits = 2), "]",
sep = ""
sep = "_"
paste("(", round(cut_point[1],
digits = 2
), ",",
digits = 2
), "]",
sep = ""
sep = "_"
if (length(cut_point_list[[j]]) > 1) {
cut_point <- cut_point_list[[j]]
cut_point <- c(
data_filtered$split_var_updated <- data_filtered$split_variable
data_filtered <- data_filtered %>%
split_var_updated =
ifelse(split_variable <= cut_point[2],
paste("[", round(cut_point[1], digits = 2), ",",
round(cut_point[2], digits = 2), "]",
sep = ""
sep = "_"
for (i in 3:length(cut_point)) {
data_filtered <- data_filtered %>%
split_var_updated =
ifelse(split_variable > cut_point[1] &
split_variable > cut_point[i - 1] &
split_variable <= cut_point[i],
round(cut_point[i - 1],
digits = 2
), ",",
digits = 2
), "]",
sep = ""
sep = "_"
data_filtered_list[[j]] <- data_filtered
data_filtered_df <- dplyr::bind_rows(data_filtered_list)
new_strata <- interaction(dplyr::select(
small_df <- cbind(new_strata, data_filtered_df)
small_df <- dplyr::select(small_df, -split_var_updated)
if (type == "categorical") {
data$split_variable <- as.character(data$split_variable)
if (is.null(split) == TRUE) {
data_filtered <- data
} else {
data_filtered <- dplyr::filter(data, old_strata %in% split)
data_filtered <- data_filtered %>%
split_var_updated =
ifelse(split_variable %in% split_at,
paste0(new_name, "_1"),
paste0(new_name, "_0")
new_strata <- interaction(dplyr::select(
data_filtered <- cbind(new_strata, data_filtered)
small_df <- dplyr::select(data_filtered, -split_var_updated)
if (is.null(split)) {
output_df <- small_df
if (is.null(split) == FALSE & length(split) == 1) {
output_df <- data %>%
dplyr::filter(old_strata != split) %>%
dplyr::mutate(new_strata = old_strata)
output_df <- rbind(output_df, small_df)
if (is.null(split) == FALSE & length(split) > 1) {
output_df <- data %>%
dplyr::filter(old_strata %in% split == FALSE) %>%
dplyr::mutate(new_strata = old_strata)
output_df <- rbind(output_df, small_df)
names(output_df)[names(output_df) == "split_variable"] <- split_var
column_names_other <- names(output_df)[names(output_df) !=
"old_strata" &
names(output_df) !=
column_names_other <- enquo(column_names_other)
output_df <- dplyr::select(
output_df, new_strata,
old_strata, !!column_names_other
if (is.numeric(output_df$new_strata) == FALSE) {
output_df$new_strata <- as.character(output_df$new_strata)
if (is.null(sort_by) == FALSE) {
output_df <- dplyr::arrange(output_df, !!sym(sort_by))
