# R/optimum_allocation.R In optimall: Allocate Samples Among Strata

#### Documented in optimum_allocation

#' Optimum Allocation
#'
#' Determines the optimum sampling fraction and sample size for
#' each stratum in a stratified random sample, which
#' minimizes the variance of the sample mean according to Neyman
#' Allocation or Exact Optimum Sample Allocation (Wright 2014).
#' @param data A data frame or matrix with at least one column specifying
#' each unit's stratum, and either 1) a second column holding the value of the
#' continuous variable for which the sample mean variance should be minimized
#' (\code{y}) or 2) two columns: one holding the the within-stratum
#' standard deviation for the variable of interest (\code{sd_h}) and another
#' holding the stratum sample sizes (\code{N_h}).
#' If \code{data} contains a column \code{y} holding values for
#' the variable of interest, then \code{data} should have one row for
#' each sampled unit. If \code{data} holds \code{sd_h} and \code{N_h}, the
#' within-stratum standard deviations and population sizes, then \code{data}
#' should have one row per stratum.
#' Other columns are allowed but will be ignored.
#' @param strata a character string or vector of character strings
#' specifying the name(s) of columns which specify the stratum
#' that each unit belongs to. If multiple column names are
#' provided, each unique combination of values in these columns
#' is taken to define one stratum.
#' @param y a character string specifying the name of the
#' continuous variable for which the variance should be minimized.
#' Defaults to \code{NULL} and should be left as \code{NULL} when \code{data}
#' holds stratum standard deviations and sample sizes instead of individual
#' sampling units.
#' @param sd_h a character string specifying the name of the
#' column holding the within-stratum standard deviations for each stratum.
#' Defaults to \code{NULL} and should be left as \code{NULL} when \code{data}
#' holds individual sampling units.
#' @param N_h a character string specifying the name of the
#' column holding the population stratum sizes for each stratum.
#' Defaults to \code{NULL} and should be left as \code{NULL} when \code{data}
#' holds individual sampling units.
#' @param nsample the desired total sample size. Defaults to \code{NULL}.
#' @param method a character string specifying the method of
#' optimum sample allocation to use. Must be one of:
#' \itemize{
#' \item \code{"WrightII"}, the default, uses Algorithm II from
#' Wright (2014) to determine the optimum allocation of a fixed
#' sample size across the strata. It requires that at least two
#' samples are allocated to each stratum.
#' \item \code{"WrightI"} uses Wright's Algorithm I to determine
#' the optimum sample allocation. It only requires that at least
#' one sample is allocated to each stratum, and can therefore
#' lead to a biased variance estimate.
#' \item \code{"Neyman"} uses the standard method of Neyman
#' Allocation to determine the optimum sample allocation. When
#' \code{nsample = NULL}, the optimal sampling fraction is calculated
#' and returned. When a numeric value is specified for \code{nsample},
#' then the number allocated to each stratum is the optimal sampling
#' fraction times \code{nsample} rounded to the nearest integer,
#' which may no longer be optimall.
#' }
#' @param ndigits a numeric value specifying the number of digits
#' to which the standard deviation and stratum fraction should be rounded.
#' Defaults to 2.
#' @param allow.na logical input specifying whether y should
#' be allowed to have NA values. Defaults to \code{FALSE}.
#' @examples
#' optimum_allocation(
#'   data = iris, strata = "Species", y = "Sepal.Length",
#'   nsample = 40, method = "WrightII"
#' )
#'
#' # Or if input data is summary of strata sd and N:
#' iris_summary <- data.frame(
#'   strata = unique(iris$Species), #' size = c(50, 50, 50), #' sd = c(0.3791, 0.3138, 0.3225) #' ) #' #' optimum_allocation( #' data = iris_summary, strata = "strata", #' sd_h = "sd", N_h = "size", #' nsample = 40, method = "WrightII" #' ) #' @export #' @references Wright, T. (2014). A Simple Method of Exact Optimal #' Sample Allocation under Stratification with any Mixed #' Constraint Patterns, Research Report Series (Statistics #2014-07), #' Center for Statistical Research and Methodology, U.S. Bureau #' of the Census, Washington, D.C. #' @return Returns a data frame with the number of samples allocated to each #' stratum, or just the sampling fractions if nsample is NULL. #' @importFrom magrittr %>% optimum_allocation <- function(data, strata, y = NULL, sd_h = NULL, N_h = NULL, nsample = NULL, ndigits = 2, method = c("WrightII", "WrightI", "Neyman"), allow.na = FALSE) { n_sd <- sd <- n <- npop <- stratum_fraction <- NULL # bind local vars as necessary if (is.matrix(data) | tibble::is_tibble(data)) { data <- data.frame(data) } if (is.data.frame(data) == FALSE) { stop("Input data must be a dataframe or matrix with named columns.") } if (all(strata %in% names(data)) == FALSE) { stop("'Strata' must be a string or vector of strings matching column names of data.") } if (anyNA(data[, strata])) { stop("Columns specifying strata contain NAs") } if ((is.null(y) & is.null(sd_h)) | (!is.null(y) & !is.null(sd_h))) { stop("One and only one of 'y' or 'sd_h' must be provided. If 'sd_h' is provided, then 'N_h' must be as well. Use ?optimum_allocation for help") } # Start if y is given and sd is NULL if (is.null(sd_h)) { if (!is.null(N_h)) { stop("If 'sd_h' is NULL, 'N_h' should also be NULL.") } if (y %in% names(data) == FALSE) { stop("'y' must be a character string matching a column name of data.") } if (is.numeric(data[, y]) == FALSE) { stop("'y' must be numeric.") } method <- match.arg(method) y <- enquo(y) strata <- enquo(strata) output_df <- data %>% dplyr::select(!!strata, !!y) strata <- interaction(dplyr::select(output_df, -!!y)) output_df <- cbind(strata, output_df) output_df <- output_df[, c(1, ncol(output_df))] # Only columns of interest names(output_df) <- c("strata", "y") if (allow.na == FALSE & sum(is.na(output_df)) >= 1) { stop("Data contains NAs. If this is intentional, set allow.na to TRUE.") } if (min(dplyr::count( dplyr::filter(output_df, !(is.na(y))), strata )[, "n"]) < 2 | length(unique(dplyr::filter(output_df, !(is.na(y)))$strata)) !=
length(unique(output_df$strata))) { stop("Function requires at least two observations per stratum") } if (method == "Neyman") { output_df <- output_df %>% dplyr::group_by(strata) %>% dplyr::summarize( n = dplyr::n(), sd = stats::sd(y, na.rm = TRUE), n_sd = stats::sd(y, na.rm = TRUE) * dplyr::n() ) %>% dplyr::mutate( stratum_fraction = round(n_sd / sum(n_sd), digits = ndigits ), sd = round(sd, digits = ndigits), n_sd = round(n_sd, digits = ndigits) ) output_df <- (as.data.frame(output_df)) names(output_df)[names(output_df) == "n"] <- "npop" if (is.null(nsample)) { output_df <- dplyr::arrange(output_df, strata) return(output_df) } else { output_df <- output_df %>% dplyr::mutate( stratum_size = round(nsample * n_sd / sum(n_sd), digits = 0 ), sd = round(sd, digits = ndigits), n_sd = round(n_sd, digits = ndigits) ) output_df <- dplyr::arrange(output_df, strata) return(output_df) } } else if (method == "WrightI") { if (is.null(nsample)) { stop("This method requires a fixed nsample. Try method = 'Neyman' for exact sampling fractions.") } else { n_strata <- length(unique(strata)) n_minus_H <- nsample - n_strata if (n_minus_H <= 0) { stop("'nsample' is too small for this method.") } output_df <- output_df %>% dplyr::group_by(strata) %>% dplyr::summarize( n = dplyr::n(), sd = stats::sd(y, na.rm = TRUE), n_sd = stats::sd(y, na.rm = TRUE) * dplyr::n() ) if (nsample > sum(output_df$n)) {
stop("'nsample' is larger than population size")
}
priority_array <- list()
for (i in 1:n_strata) {
priority_array[[i]] <- c(
rep(output_df[i, "n_sd"],
times = min(output_df[i, "n"] - 1, n_minus_H)
),
rep(NULL,
times = ifelse(n_minus_H > (output_df[i, "n"] - 1),
n_minus_H - (output_df[i, "n"] - 1),
0
)
)
)
# All rows are same length, but zeroes so that n_sample
# won't be larger than n_strata
names(priority_array)[[i]] <- paste0("n_sd", as.character(i))
}
suppressMessages(Wright_output <- dplyr::bind_rows(priority_array))
Wright_output[is.na(Wright_output)] <- 0
mult_vec <- vector()
for (i in 1:(n_minus_H + 1)) {
mult_vec[i] <- 1 / (sqrt(i * (i + 1)))
}
for (i in seq_len(ncol(Wright_output))) {
Wright_output[, i] <- Wright_output[, i] * mult_vec[i]
}
cutoff <- (sort(unlist(Wright_output, use.names = FALSE),
decreasing = TRUE
))[n_minus_H]
stratum_size <- rowSums(Wright_output >= cutoff) + 1
final_output <- cbind(
output_df[, c("strata", "n", "sd", "n_sd")], stratum_size
)
final_output <- final_output %>%
dplyr::mutate(
stratum_fraction = round(stratum_size / nsample,
digits = ndigits
),
sd = round(sd, digits = ndigits),
n_sd = round(n_sd, digits = ndigits)
)
final_output <- final_output[c(
"strata", "n", "sd", "n_sd", "stratum_fraction",
"stratum_size"
)]
names(final_output)[names(final_output) == "n"] <- "npop"
final_output <- dplyr::arrange(final_output, strata)
return(final_output)
}
}
else if (method == "WrightII") {
if (is.null(nsample)) {
stop("This method requires a fixed nsample. Try method =
'Neyman' for exact sampling fractions.")
}
else {
n_strata <- length(unique(strata))
n_minus_2H <- nsample - 2 * n_strata
if (n_minus_2H <= 0) {
stop("'nsample' is too small for this method.")
}
output_df <- output_df %>%
dplyr::group_by(strata) %>%
dplyr::summarize(
n = dplyr::n(),
sd = stats::sd(y, na.rm = TRUE),
n_sd = stats::sd(y, na.rm = TRUE) * dplyr::n()
)
if (nsample > sum(output_df$n)) { stop("'nsample' is larger than population size") } priority_array <- list() for (i in 1:n_strata) { priority_array[[i]] <- c( rep(output_df[i, "n_sd"], times = min(output_df[i, "n"] - 2, n_minus_2H) ), rep(output_df[i, "n_sd"] * 0, times = ifelse(n_minus_2H > (output_df[i, "n"] - 2), n_minus_2H - (output_df[i, "n"] - 2), 0 ) ) ) # All rows are same length, but zeroes so that n_sample # won't be larger than n_strata. Zero instead of NULL so # entries in list aren't empty when n=2. names(priority_array)[[i]] <- paste0("n_sd", as.character(i)) } suppressMessages(Wright_output <- dplyr::bind_rows(priority_array)) Wright_output[is.na(Wright_output)] <- 0 mult_vec <- vector() for (i in 2:(n_minus_2H + 1)) { mult_vec[i - 1] <- 1 / (sqrt(i * (i + 1))) } for (i in seq_len(ncol(Wright_output))) { Wright_output[, i] <- Wright_output[, i] * mult_vec[i] } cutoff <- (sort(unlist(Wright_output, use.names = FALSE), decreasing = TRUE ))[n_minus_2H] stratum_size <- rowSums(Wright_output >= cutoff) + 2 final_output <- cbind( output_df[, c("strata", "n", "sd", "n_sd")], stratum_size ) final_output <- final_output %>% dplyr::mutate( stratum_fraction = round(stratum_size / nsample, digits = ndigits ), sd = round(sd, digits = ndigits), n_sd = round(n_sd, digits = ndigits) ) final_output <- final_output[ c( "strata", "n", "sd", "n_sd", "stratum_fraction", "stratum_size" ) ] names(final_output)[names(final_output) == "n"] <- "npop" final_output <- dplyr::arrange(final_output, strata) return(final_output) } } # End if y is given and sd/N is NULL } # Start if sd/N is given and y is NULL if (is.null(y)) { if (sd_h %in% names(data) == FALSE | N_h %in% names(data) == FALSE) { stop("'sd_h' and 'N_h' must be character strings matching column names of 'data'.") } if (is.numeric(data[, sd_h]) == FALSE | is.numeric(data[, N_h]) == FALSE) { stop("'sd_h' and 'N_h' must be numeric.") } method <- match.arg(method) sd_h <- enquo(sd_h) N_h <- enquo(N_h) strata <- enquo(strata) output_df <- data %>% dplyr::select(!!strata, !!N_h, !!sd_h) strata <- interaction(dplyr::select(output_df, -!!N_h, -!!sd_h)) output_df <- cbind(strata, output_df) output_df <- output_df[, c(1, ncol(output_df) - 1, ncol(output_df))] # Only columns of interest names(output_df) <- c("strata", "N_h", "sd_h") if (any(table(output_df$strata) > 1)) {
stop("If using 'sd_h' and 'N_h' instead of 'y',
data must only contain one row per stratum.")
}
if (sum(is.na(output_df)) >= 1) {
stop("Data cannot contain NAs.")
}
if (method == "Neyman") {
output_df <- output_df %>%
dplyr::mutate(
n_sd = N_h * sd_h
) %>%
dplyr::mutate(
stratum_fraction = round(n_sd / sum(n_sd),
digits = ndigits
),
sd = round(sd_h, digits = ndigits),
n = N_h,
n_sd = round(n_sd, digits = ndigits)
)
output_df <- (as.data.frame(output_df))
names(output_df)[names(output_df) == "n"] <- "npop"
if (is.null(nsample)) {
output_df <- dplyr::arrange(output_df, strata)
output_df <- dplyr::select(
output_df,
strata, npop, sd, n_sd,
stratum_fraction
)
return(output_df)
}
else {
output_df <- output_df %>%
dplyr::mutate(
stratum_size = round(nsample * n_sd / sum(n_sd),
digits = 0
),
sd = round(sd, digits = ndigits),
n_sd = round(n_sd, digits = ndigits)
)
output_df <- dplyr::arrange(output_df, strata)
return(output_df)
}
}
else if (method == "WrightI") {
if (is.null(nsample)) {
stop("This method requires a fixed nsample. Try method =
'Neyman' for exact sampling fractions.")
}
else {
n_strata <- length(unique(strata))
n_minus_H <- nsample - n_strata
if (n_minus_H <= 0) {
stop("'nsample' is too small for this method.")
}
output_df <- output_df %>%
dplyr::mutate(
n = N_h,
sd = sd_h,
n_sd = n * sd
)
output_df <- tibble::as_tibble(dplyr::select(
output_df,
strata, n, sd, n_sd
))
if (nsample > sum(output_df$n)) { stop("'nsample' is larger than population size") } priority_array <- list() for (i in 1:n_strata) { priority_array[[i]] <- c( rep(output_df[i, "n_sd"], times = min(output_df[i, "n"] - 1, n_minus_H) ), rep(NULL, times = ifelse(n_minus_H > (output_df[i, "n"] - 1), n_minus_H - (output_df[i, "n"] - 1), 0 ) ) ) # All rows are same length, but zeroes so that n_sample # won't be larger than n_strata names(priority_array)[[i]] <- paste0("n_sd", as.character(i)) } suppressMessages(Wright_output <- dplyr::bind_rows(priority_array)) Wright_output[is.na(Wright_output)] <- 0 mult_vec <- vector() for (i in 1:(n_minus_H + 1)) { mult_vec[i] <- 1 / (sqrt(i * (i + 1))) } for (i in seq_len(ncol(Wright_output))) { Wright_output[, i] <- Wright_output[, i] * mult_vec[i] } cutoff <- (sort(unlist(Wright_output, use.names = FALSE), decreasing = TRUE ))[n_minus_H] stratum_size <- rowSums(Wright_output >= cutoff) + 1 final_output <- cbind( output_df[, c("strata", "n", "sd", "n_sd")], stratum_size ) final_output <- final_output %>% dplyr::mutate( stratum_fraction = round(stratum_size / nsample, digits = ndigits ), sd = round(sd, digits = ndigits), n_sd = round(n_sd, digits = ndigits) ) final_output <- final_output[c( "strata", "n", "sd", "n_sd", "stratum_fraction", "stratum_size" )] names(final_output)[names(final_output) == "n"] <- "npop" final_output <- dplyr::arrange(final_output, strata) return(final_output) } } else if (method == "WrightII") { if (is.null(nsample)) { stop("This method requires a fixed nsample. Try method = 'Neyman' for exact sampling fractions.") } else { n_strata <- length(unique(strata)) n_minus_2H <- nsample - 2 * n_strata if (n_minus_2H <= 0) { stop("'nsample' is too small for this method.") } output_df <- output_df %>% dplyr::mutate( n = N_h, sd = sd_h, n_sd = n * sd ) output_df <- tibble::as_tibble(dplyr::select( output_df, strata, n, sd, n_sd )) if (nsample > sum(output_df$n)) {
stop("'nsample' is larger than population size")
}
priority_array <- list()
for (i in 1:n_strata) {
priority_array[[i]] <- c(
rep(output_df[i, "n_sd"],
times = min(output_df[i, "n"] - 2, n_minus_2H)
),
rep(output_df[i, "n_sd"] * 0,
times = ifelse(n_minus_2H > (output_df[i, "n"] - 2),
n_minus_2H - (output_df[i, "n"] - 2),
0
)
)
)
# All rows are same length, but zeroes so that n_sample
# won't be larger than n_strata. Zero instead of NULL so
# entries in list aren't empty when n=2.
names(priority_array)[[i]] <- paste0("n_sd", as.character(i))
}
suppressMessages(Wright_output <- dplyr::bind_rows(priority_array))
Wright_output[is.na(Wright_output)] <- 0
mult_vec <- vector()
for (i in 2:(n_minus_2H + 1)) {
mult_vec[i - 1] <- 1 / (sqrt(i * (i + 1)))
}
for (i in seq_len(ncol(Wright_output))) {
Wright_output[, i] <- Wright_output[, i] * mult_vec[i]
}
cutoff <- (sort(unlist(Wright_output, use.names = FALSE),
decreasing = TRUE
))[n_minus_2H]
stratum_size <- rowSums(Wright_output >= cutoff) + 2
final_output <- cbind(
output_df[, c("strata", "n", "sd", "n_sd")], stratum_size
)
final_output <- final_output %>%
dplyr::mutate(
stratum_fraction = round(stratum_size / nsample,
digits = ndigits
),
sd = round(sd, digits = ndigits),
n_sd = round(n_sd, digits = ndigits)
)
final_output <- final_output[
c(
"strata", "n", "sd", "n_sd", "stratum_fraction",
"stratum_size"
)
]
names(final_output)[names(final_output) == "n"] <- "npop"
final_output <- dplyr::arrange(final_output, strata)
return(final_output)
}
}
# End if sd is given and y is NULL
}
}


## Try the optimall package in your browser

Any scripts or data that you put into this service are public.

optimall documentation built on June 22, 2024, 9:34 a.m.