#' Create universe
#'
#' Along with the data to be weighted, this combines one or more objects of
#' class \code{category} to build the universe with known marginal proportions.
#' It also checks and adjusts the proportions to account for missing values in the data.
#'
#' @param data Data frame containing data where weights are desired.
#' @param ... One or more output objects from \code{category()}.
#' @param N Size of universe. If supplied, expansion factor will be applied
#' to weights after convergence.
#'
#' @importFrom glue glue glue_collapse
#' @importFrom purrr map map_lgl map_chr pluck
#' @importFrom dplyr sym pull mutate select filter count full_join case_when rename %>%
#' @importFrom scales percent
#' @importFrom tibble tibble add_column
#' @importFrom labelled remove_labels
#'
#' @return A \code{list} with special class \code{universe}.
#'
#' @examples
#' data(demo_data)
#'
#' universe(
#' data = demo_data,
#'
#' category(
#' name = "Sex",
#' buckets = factor(
#' x = levels(demo_data[["Sex"]]),
#' levels = levels(demo_data[["Sex"]])
#' ),
#' targets = c(0.4, 0.5),
#' sum.1 = TRUE
#' ),
#'
#' category(
#' name = "BirthYear",
#' buckets = c(1986:1990),
#' targets = rep(0.2, times = 5)
#' ),
#'
#' category(
#' name = "EyeColor",
#' buckets = c("brown", "green", "blue"),
#' targets = c(0.8, 0.1, 0.1)
#' ),
#'
#' category(
#' name = "HomeOwner",
#' buckets = c(TRUE, FALSE),
#' targets = c(3/4, 1/4)
#' )
#' )
#'
#' @export
universe <- function(data, ..., N) {
# up front error checks ---------------------------------------------------
# make sure data frame is supplied
if (!is.data.frame(data)) {
stop("Input to `data` must be a data frame.")
}
# list object of all unspecified arguments passed to function
categories <- list(...)
# make sure at least one category was provided
if (length(categories) < 1) {
stop("Provide at least one weighting category, use `category()` to construct this.")
}
# are all inputs to this function category class?
if (!(all(map_lgl(categories, function(x) "category" %in% class(x))))) {
stop("Each input to `...` must be of the class 'category'. Use `category()` to construct this input.")
}
# make sure N is good
if (!missing(N)) {
if (N <= nrow(data)) {
stop(glue(
"
Input to `N` must be a single numeric value larger than the size of your sample ({nrow(data)}).
"
))
}
} else {
# would this work best for doing the expansion factor calculation?
N <- 1
}
# verify categories specified exist in the data ---------------------------
# Simply check to make sure that the user is not creating categories which
# don't exist in the data to be weighted.
df.names <- names(data)
wgt.cats <- map_chr(categories, pluck, "category")
bad.cats <- wgt.cats[!wgt.cats %in% df.names]
if (length(bad.cats) > 0) {
stop.message <- glue(
"
Each name given to a weighting category in `universe()` must have a matching column name in `data`. The following weighting categories have no match:
",
glue_collapse(bad.cats, sep = ", "),
.sep = "\n"
)
stop(stop.message)
}
# check that all buckets exist in the data --------------------------------
# This will check to make sure that the buckets provided are also in the
# data. One thing to consider here is that there may be missing data in
# `data`, so when running the checks, we need to drop missing values before
# the comparison takes place. The model will be ajusted for NA's in a
# later step.
# this may come in handy later...
drop_na_vec <- function(x) x[!is.na(x)]
num.cats <- length(wgt.cats)
df.unique <- map(data[wgt.cats], unique)
wgt.buckets <- map(categories, pluck, "buckets")
for (i in 1:num.cats) {
# vector class compatibility
df.class <- class(df.unique[[i]])
wgt.class <- class(wgt.buckets[[i]])
classes.match <- df.class == wgt.class
# there will be some coercion happening further down the line
# these are scenarios that should skip the stop bit here
if (!classes.match) {
if (df.class %in% c("numeric", "integer") & wgt.class %in% c("numeric", "integer")) {
classes.match <- TRUE
}
if (df.class %in% c("factor", "character") & wgt.class %in% c("factor", "character")) {
classes.match <- TRUE
}
}
if (!classes.match) {
stop(glue(
"
Mismatch in variable classes for '{wgt.cats[[i]]}' weighting category.
---------------------------------------------------------------------------
Class in data: {df.class}
Class in `category()`: {wgt.class}
---------------------------------------------------------------------------
Please reconcile this difference before proceeding.
"
))
}
# do not allow NA's in bucket
if (any(is.na(wgt.buckets[[i]]))) {
stop(glue(
"
`NA` is not a valid bucket, please review input to `category()` for '{wgt.cats[[i]]}' weighting category.
"
))
}
# opted for `all.equal()` over `identical()` because `identical()`
# got hung up on mismatches between numeric/integer
buckets.match <- isTRUE(all.equal(
# note that sort() drops NA's by default
# as.character eliminates factor level/character sort issues
target = as.character(df.unique[[i]]) %>% sort(),
current = as.character(wgt.buckets[[i]]) %>% sort()
))
if (!buckets.match) {
stop.message <- glue(
"
There are mismatches between the buckets provided, and the unique values of `data` for the '{wgt.cats[[i]]}' weighting category.
---------------------------------------------------------------------------
Unique in data: {glue_collapse(sort(df.unique[[i]]), sep = ', ')}
Unique in `category()`: {glue_collapse(sort(wgt.buckets[[i]]), sep = ', ')}
---------------------------------------------------------------------------
Please reconcile this difference before proceeding.
*Note: Missing values (NA) in the data are acceptable.
"
)
stop(stop.message)
}
}
# adjust the population model ---------------------------------------------
# This works by doing the following steps:
# 1. Find the actual proportions in the data.
# 2. Create a data frame of the target proportions.
# 3. Variable coercion for numeric/integer and factor/character
# 4. Join actual proportions to target full_join(), this ensures that
# correct ordering is maintained, and identifies proportions of NA's
# in the data.
# 5. Check to see if NA's exist in the data, if so, find proportion.
# 6. If NA's exist, adjust the targets by target * (1 - prop NA)
# Output from this process will be stored in a new list
universe <- list()
# This is used to print out the weighting categories which were adjusted
# for missing values
wgt.cats.adj <- character(0)
for (i in 1:num.cats) {
# 1. Find actual proportions in the data
wc <- sym(wgt.cats[[i]])
# remove_labels helps classes match for the full_join
act.prop <-
data %>%
count({{ wc }}) %>%
mutate(act_prop = n / sum(n)) %>%
select(-n) %>%
remove_labels()
# 2. Data frame of targets proportions
targ.prop <- tibble(
{{ wc }} := categories[[i]][["buckets"]],
targ_prop = categories[[i]][["targ_prop"]]
)
# 3. Check for differing classes - coerce some scenarios if needed for later full_join
class.act <- class(act.prop[[wgt.cats[i]]])
class.tar <- class(targ.prop[[wgt.cats[i]]])
if (class.act != class.tar) {
# force target class to match actual/data class - numeric/integer
if (class.act %in% c("numeric", "integer") & class.tar %in% c("numeric", "integer")) {
if (class.act == "numeric") {
targ.prop[[wgt.cats[i]]] <- as.numeric(targ.prop[[wgt.cats[i]]])
} else if (class.act == "integer") {
targ.prop[[wgt.cats[i]]] <- as.integer(targ.prop[[wgt.cats[i]]])
}
# force target class to match actual/data class - factor/character
} else if (class.act %in% c("factor", "character") & class.tar %in% c("factor", "character")) {
if (class.act == "factor") {
targ.prop[[wgt.cats[i]]] <- factor(
x = targ.prop[[wgt.cats[i]]],
levels = levels(act.prop[[wgt.cats[i]]])
)
} else if (class.act == "character") {
targ.prop[[wgt.cats[i]]] <- as.character(targ.prop[[wgt.cats[i]]])
}
}
}
# 4. Join target to actual
all.prop <- full_join(
targ.prop,
act.prop,
by = wgt.cats[[i]]
)
# 5. Find proportion missing
if (any(is.na(all.prop))) {
p.na <-
all.prop %>%
filter(is.na({{ wc }})) %>%
pull(act_prop)
# collect category name for printing
wgt.cats.adj <- c(
wgt.cats.adj,
glue("{wgt.cats[[i]]} ({scales::percent(p.na, accuracy = 0.1)})")
)
# 6. Adjust target proportions to account for p NA
all.prop <-
all.prop %>%
mutate(targ_prop = case_when(
is.na(targ_prop) ~ p.na,
TRUE ~ targ_prop * (1 - p.na)
))
}
# Add finalized data frame to the output list
universe[[i]] <-
all.prop %>%
rename("bucket" = 1) %>%
add_column(category = wgt.cats[[i]], .before = 1)
}
if (length(wgt.cats.adj) > 0) {
warning(glue(
"
Missing data was found in the following categories:
---------------------------------------------------------------------------
{glue_collapse(wgt.cats.adj, sep = '\n')}
---------------------------------------------------------------------------
Target proportions have been reproportioned to account for missing data.
"
))
}
# final return ------------------------------------------------------------
names(universe) <- wgt.cats
out <- list(
data = data,
universe = universe,
N = N
)
class(out) <- c(class(universe), "universe")
return(out)
}
utils::globalVariables(c("category", "n"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.