#' Apply post-stratification to classifiers.
#'
#' @inheritParams auto_MrP
#' @param best.subset.opt Optimal tuning parameters from best subset selection
#' classifier. A list returned by \code{run_best_subset()}.
#' @param lasso.opt Optimal tuning parameters from lasso classifier. A list
#' returned by \code{run_lasso()}.
#' @param pca.opt Optimal tuning parameters from best subset selection with
#' principal components classifier. A list returned by \code{run_pca()}.
#' @param gb.opt Optimal tuning parameters from gradient tree boosting
#' classifier. A list returned by \code{run_gb()}.
#' @param svm.opt Optimal tuning parameters from support vector machine
#' classifier. A list returned by \code{run_svm()}.
#' @param mrp.include Whether to run MRP classifier. A logical argument
#' indicating whether the standard MRP classifier should be used for
#' predicting outcome \code{y}. Passed from \code{autoMrP()} argument
#' \code{mrp}.
#' @param n.minobsinnode GB minimum number of observations in the terminal
#' nodes. An integer-valued scalar specifying the minimum number of
#' observations that each terminal node of the trees must contain. Passed from
#' \code{autoMrP()} argument \code{gb.n.minobsinnode}.
#' @param L2.unit.include GB L2.unit. A logical argument indicating whether
#' \code{L2.unit} should be included in the GB classifier. Passed from
#' \code{autoMrP()} argument \code{gb.L2.unit}.
#' @param L2.reg.include A logical argument indicating whether \code{L2.reg}
#' should be included in the GB classifier. Passed from \code{autoMrP()}
#' argument \code{gb.L2.reg}.
#' @param kernel SVM kernel. A character-valued scalar specifying the kernel to
#' be used by SVM. The possible values are \code{linear}, \code{polynomial},
#' \code{radial}, and \code{sigmoid}. Passed from \code{autoMrP()} argument
#' \code{svm.kernel}.
#' @param data A data.frame containing the survey data used in classifier
#' training.
#' @param ebma.fold A data.frame containing the data not used in classifier
#' training (the fold held out for the EBMA ensemble).
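#' @return A list with two elements: \code{predictions}, itself a list of the
#' individual-level predictions used by EBMA (\code{Level1}) and the
#' post-stratified unit-level predictions (\code{Level2}), and \code{models},
#' the fitted classifier objects passed on to EBMA.
#' @examples
#' \dontrun{
#' # Minimal sketch; post_stratification() is normally called internally by
#' # autoMrP(). The objects best_subset_tune, cv_data, ebma_data, and
#' # census_data are hypothetical placeholders for objects produced earlier
#' # in the autoMrP() pipeline.
#' ps <- post_stratification(
#'   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
#'   L2.x = c("L2.x1", "L2.x2"), L2.unit = "state", L2.reg = "region",
#'   best.subset.opt = best_subset_tune, lasso.opt = NULL, lasso.L2.x = NULL,
#'   pca.opt = NULL, gb.opt = NULL, svm.opt = NULL, svm.L2.reg = NULL,
#'   svm.L2.unit = NULL, svm.L2.x = NULL, mrp.include = FALSE,
#'   n.minobsinnode = 5, L2.unit.include = TRUE, L2.reg.include = TRUE,
#'   kernel = "radial", mrp.L2.x = NULL, data = cv_data,
#'   ebma.fold = ebma_data, census = census_data, verbose = FALSE,
#'   deep.mrp = FALSE, deep.splines = TRUE
#' )
#' }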
post_stratification <- function(
y, L1.x, L2.x, L2.unit, L2.reg, best.subset.opt, lasso.opt, lasso.L2.x,
pca.opt, gb.opt, svm.opt, svm.L2.reg, svm.L2.unit, svm.L2.x, mrp.include,
n.minobsinnode, L2.unit.include, L2.reg.include, kernel, mrp.L2.x,
data, ebma.fold, census, verbose, deep.mrp, deep.splines
) {
# globals
lasso <- NULL
pca <- NULL
gb <- NULL
svm <- NULL
mrp <- NULL
best_subset <- NULL
# Copy L2.unit because it is needed twice but may be overwritten
# depending on the call
L2_unit <- L2.unit
# post-stratification without level 2 variables
if (all(L2.x == "")) {
L2.x <- NULL
}
# model container for EBMA
models <- list()
# data for the post-stratification-only models: training data plus the EBMA
# fold (these models do not enter EBMA)
no_ebma_data <- dplyr::bind_rows(data, ebma.fold)
# remove missing values
data <- tidyr::drop_na(
data = data, dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))
)
no_ebma_data <- tidyr::drop_na(
data = no_ebma_data, dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))
)
# Determine type of dependent variable: binary if it takes exactly two
# unique values, continuous otherwise
if (
no_ebma_data %>%
dplyr::pull(!!y) %>%
unique() %>%
length() == 2
) {
dv_type <- "binary"
} else {
dv_type <- "linear"
}
# ----- Fit optimal model and make prediction for individual classifiers -----
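# Each classifier below is fit twice: once on the training folds only (its
# individual-level predictions and fitted model feed the EBMA ensemble) and
# once on all data including the EBMA fold (used for post-stratification
# only). Post-stratified estimates are the census-proportion-weighted means
# of the cell-level predictions within each L2 unit, i.e.
# estimate_g = sum over cells c in unit g of prop_c * prediction_c.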
# Classifier 1: Best Subset
if (!is.null(best.subset.opt)) {
# deep MrP or not
if (deep.mrp) {
# Fit optimal model for EBMA
best_subset_opt_ebma <- deep_mrp_classifier(
form = best.subset.opt,
y = y,
data = data,
verbose = verbose
)
# Fit optimal model for post-stratification w/o EBMA
best_subset_opt_poststrat_only <- deep_mrp_classifier(
form = best.subset.opt,
y = y,
data = no_ebma_data,
verbose = verbose
)
if (dv_type == "binary") {
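# predict_MAVB() returns posterior means on the link (logit) scale;
# stats::plogis() maps them to probabilities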
# post-stratification
bs_preds <- census %>%
dplyr::mutate(
best_subset = vglmer::predict_MAVB(
samples = 1000,
best_subset_opt_poststrat_only,
newdata = census,
allow_missing_levels = TRUE
)[["mean"]] %>%
stats::plogis()
) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(best_subset = stats::weighted.mean(
x = best_subset, w = prop
), .groups = "keep") %>%
dplyr::ungroup()
# individual level predictions for EBMA
bs_ind <- vglmer::predict_MAVB(
samples = 1000,
best_subset_opt_ebma,
newdata = data,
allow_missing_levels = TRUE
)[["mean"]] %>%
stats::plogis()
# model for EBMA
models$best_subset <- best_subset_opt_ebma
} else {
# post-stratification
bs_preds <- census %>%
dplyr::mutate(
best_subset = vglmer::predict_MAVB(
samples = 1000,
best_subset_opt_poststrat_only,
newdata = census,
allow_missing_levels = TRUE
)[["mean"]]
) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(best_subset = stats::weighted.mean(
x = best_subset, w = prop
), .groups = "keep") %>%
dplyr::ungroup()
# individual level predictions for EBMA
bs_ind <- vglmer::predict_MAVB(
samples = 1000,
best_subset_opt_ebma,
newdata = data,
allow_missing_levels = TRUE
)[["mean"]]
# model for EBMA
models$best_subset <- best_subset_opt_ebma
}
} else {
# Fit optimal model for EBMA
best_subset_opt_ebma <- best_subset_classifier(
y = y,
model = best.subset.opt,
data.train = data,
model.family = binomial(link = "probit"),
model.optimizer = "bobyqa",
n.iter = 1000000,
verbose = verbose
)
# Fit optimal model for post-stratification w/o EBMA
best_subset_opt_poststrat_only <- best_subset_classifier(
y = y,
model = best.subset.opt,
data.train = no_ebma_data,
model.family = binomial(link = "probit"),
model.optimizer = "bobyqa",
n.iter = 1000000,
verbose = verbose
)
# post-stratification
bs_preds <- census %>%
dplyr::mutate(
best_subset = stats::predict(
object = best_subset_opt_poststrat_only,
newdata = .,
allow.new.levels = TRUE,
type = "response"
)
) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(best_subset = stats::weighted.mean(
x = best_subset, w = prop
), .groups = "keep") %>%
dplyr::ungroup()
# individual level predictions for EBMA
bs_ind <- stats::predict(object = best_subset_opt_ebma, type = "response")
# model for EBMA
models$best_subset <- best_subset_opt_ebma
}
}
# Classifier 2: Lasso
if (!is.null(lasso.opt)) {
# Determine context-level covariates
if (is.null(lasso.L2.x)) {
lasso.L2.x <- L2.x
}
# Context-level fixed effects
L2_fe <- paste(lasso.L2.x, collapse = " + ")
L2_fe_form <- as.formula(paste(y, " ~ ", L2_fe, sep = ""))
# Individual-level random effects as named list
L1_re <- setNames(
as.list(
rep(
c(~ 1),
times = length(
c(L1.x, L2.unit, L2.reg)
)
)
), c(L1.x, L2.unit, L2.reg)
)
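# e.g., with L1.x = c("L1x1", "L1x2"), L2.unit = "state", L2.reg = "region",
# L1_re is list(L1x1 = ~ 1, L1x2 = ~ 1, state = ~ 1, region = ~ 1) and
# L2_fe_form is YES ~ L2.x1 + L2.x2 (illustrative variable names)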
# Fit optimal model for EBMA
lasso_opt_ebma <- lasso_classifier(
y = y,
L2.fix = L2_fe_form,
L1.re = L1_re,
data.train = data,
lambda = lasso.opt,
model.family = binomial(link = "probit"),
verbose = verbose
)
# Fit optimal model for post-stratification w/o EBMA
lasso_opt_poststrat_only <- lasso_classifier(
y = y,
L2.fix = L2_fe_form,
L1.re = L1_re,
data.train = no_ebma_data,
lambda = lasso.opt,
model.family = binomial(link = "probit"),
verbose = verbose
)
# predictions for post-stratification only (no EBMA)
lasso_ests <- predict_glmmLasso(
m = lasso_opt_poststrat_only,
lasso.L2.x = lasso.L2.x,
L2.unit = L2.unit,
L2.reg = L2.reg,
L1.x = L1.x,
census = census
)
# post-stratification
lasso_preds <- census %>%
dplyr::mutate(lasso = lasso_ests) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(
lasso = stats::weighted.mean(x = lasso, w = prop), .groups = "keep"
) %>%
dplyr::ungroup()
# individual level predictions for EBMA
lasso_ind <- stats::predict(object = lasso_opt_ebma, type = "response")
# model for EBMA
models$lasso <- lasso_opt_ebma
}
# Classifier 3: PCA
if (!is.null(pca.opt)) {
# deep MrP or not
if (deep.mrp) {
# Fit optimal model for EBMA
pca_opt_ebma <- deep_mrp_classifier(
form = pca.opt,
y = y,
data = data,
verbose = verbose
)
# Fit optimal model for post-stratification w/o EBMA
pca_opt_poststrat_only <- deep_mrp_classifier(
form = pca.opt,
y = y,
data = no_ebma_data,
verbose = verbose
)
# dependent variable type
if (dv_type == "binary") {
# post-stratification
pca_preds <- census %>%
dplyr::mutate(
pca = vglmer::predict_MAVB(
samples = 1000,
pca_opt_poststrat_only,
newdata = census,
allow_missing_levels = TRUE
)[["mean"]] %>%
stats::plogis()
) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(
pca = stats::weighted.mean(x = pca, w = prop), .groups = "keep"
) %>%
dplyr::ungroup()
# individual level predictions for EBMA
pca_ind <- vglmer::predict_MAVB(
samples = 1000,
pca_opt_ebma,
newdata = data,
allow_missing_levels = TRUE
)[["mean"]] %>%
stats::plogis()
} else {
# post-stratification
pca_preds <- census %>%
dplyr::mutate(
pca = vglmer::predict_MAVB(
samples = 1000,
pca_opt_poststrat_only,
newdata = census,
allow_missing_levels = TRUE
)[["mean"]]
) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(
pca = stats::weighted.mean(x = pca, w = prop), .groups = "keep"
) %>%
dplyr::ungroup()
# individual level predictions for EBMA
pca_ind <- vglmer::predict_MAVB(
samples = 1000,
pca_opt_ebma,
newdata = data,
allow_missing_levels = TRUE
)[["mean"]]
}
# model for EBMA
models$pca <- pca_opt_ebma
} else {
# Fit optimal model for EBMA
pca_opt_ebma <- best_subset_classifier(
y = y,
model = pca.opt,
data.train = data,
model.family = binomial(link = "probit"),
model.optimizer = "bobyqa",
n.iter = 1000000,
verbose = verbose
)
# Fit optimal model for post-stratification w/o EBMA
pca_opt_poststrat_only <- best_subset_classifier(
y = y,
model = pca.opt,
data.train = no_ebma_data,
model.family = binomial(link = "probit"),
model.optimizer = "bobyqa",
n.iter = 1000000,
verbose = verbose
)
# post-stratification
pca_preds <- census %>%
dplyr::mutate(pca = stats::predict(
object = pca_opt_poststrat_only,
newdata = .,
allow.new.levels = TRUE,
type = "response"
)) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(
pca = stats::weighted.mean(x = pca, w = prop), .groups = "keep"
) %>%
dplyr::ungroup()
# individual level predictions for EBMA
pca_ind <- stats::predict(object = pca_opt_ebma, type = "response")
# model for EBMA
models$pca <- pca_opt_ebma
}
}
# Classifier 4: GB
if (!is.null(gb.opt)) {
# Evaluate inclusion of L2.unit
if (isTRUE(L2.unit.include)) {
L2.unit.gb <- L2.unit
} else {
L2.unit.gb <- NULL
}
# Evaluate inclusion of L2.reg
if (isTRUE(L2.reg.include)) {
L2.reg.gb <- L2.reg
} else {
L2.reg.gb <- NULL
}
# Create model formula
x <- paste(c(L1.x, L2.x, L2.unit.gb, L2.reg.gb), collapse = " + ")
form_gb <- as.formula(paste(y, " ~ ", x, sep = ""))
# Fit optimal model for EBMA
gb_opt_ebma <- gb_classifier(
y = y,
form = form_gb,
distribution = "bernoulli",
data.train = data,
n.trees = gb.opt$n_trees,
interaction.depth = gb.opt$interaction_depth,
n.minobsinnode = n.minobsinnode,
shrinkage = gb.opt$shrinkage,
verbose = verbose
)
# Fit optimal model for post-stratification w/o EBMA
gb_opt_poststrat_only <- gb_classifier(
y = y,
form = form_gb,
distribution = "bernoulli",
data.train = no_ebma_data,
n.trees = gb.opt$n_trees,
interaction.depth = gb.opt$interaction_depth,
n.minobsinnode = n.minobsinnode,
shrinkage = gb.opt$shrinkage,
verbose = verbose
)
# post-stratification
gb_preds <- census %>%
dplyr::mutate(gb = gbm::predict.gbm(
object = gb_opt_poststrat_only,
newdata = .,
n.trees = gb.opt$n_trees,
type = "response"
)) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(
gb = stats::weighted.mean(x = gb, w = prop), .groups = "keep"
) %>%
dplyr::ungroup()
# individual level predictions for EBMA
gb_ind <- gbm::predict.gbm(
object = gb_opt_ebma, n.trees = gb.opt$n_trees, type = "response"
)
# model for EBMA
models$gb <- gb_opt_ebma
}
# Classifier 5: SVM
if (!is.null(svm.opt)) {
# Determine context-level covariates
if (is.null(svm.L2.x)) {
svm.L2.x <- L2.x
}
# Evaluate inclusion of L2.unit in SVM
if (isTRUE(svm.L2.unit)) {
svm.L2.unit <- L2.unit
} else {
svm.L2.unit <- NULL
}
# Evaluate inclusion of L2.reg in SVM
if (isTRUE(svm.L2.reg)) {
svm.L2.reg <- L2.reg
} else {
svm.L2.reg <- NULL
}
# Prepare data
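# e1071's C-classification requires a factor response, hence the y_svm copy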
svm_data <- data %>%
dplyr::mutate_at(.vars = y, .funs = list(y_svm = ~as.factor(.))) %>%
dplyr::select(
y_svm, dplyr::all_of(c(y, L1.x, svm.L2.x, L2.unit, L2.reg))
)
svm_data_no_ebma <- data %>%
dplyr::bind_rows(ebma.fold) %>%
dplyr::mutate_at(.vars = y, .funs = list(y_svm = ~as.factor(.))) %>%
dplyr::select(
y_svm, dplyr::all_of(c(y, L1.x, svm.L2.x, L2.unit, L2.reg))
)
# Create model formula
x <- paste(c(L1.x, svm.L2.x, svm.L2.unit, svm.L2.reg), collapse = " + ")
form_svm <- as.formula(paste("y_svm ~ ", x, sep = ""))
# Fit optimal model for EBMA
svm_opt_ebma <- svm_classifier(
y = "y_svm",
form = form_svm,
data = svm_data,
kernel = svm.opt$kernel,
type = "C-classification",
probability = TRUE,
svm.gamma = svm.opt$gamma,
svm.cost = svm.opt$cost,
verbose = verbose
)
# Fit optimal model for post-stratification w/o EBMA
svm_opt_poststrat_only <- svm_classifier(
y = "y_svm",
form = form_svm,
data = svm_data_no_ebma,
kernel = svm.opt$kernel,
type = "C-classification",
probability = TRUE,
svm.gamma = svm.opt$gamma,
svm.cost = svm.opt$cost,
verbose = verbose
)
# post-stratification step 1: restrict census to L2 units present in the
# training data (SVM cannot predict for factor levels it has not seen)
svm_preds <- census %>%
dplyr::filter(
dplyr::pull(.data = census, !!L2.unit) %in%
dplyr::pull(.data = svm_data_no_ebma, !!L2.unit)
)
# post-stratification step 2: predictions depending on the DV
if (
svm_data_no_ebma %>%
dplyr::pull(var = "y_svm") %>%
unique() %>%
length() > 2
) {
# continuous DV
svm_preds <- svm_preds %>%
dplyr::mutate(
svm = stats::predict(
object = svm_opt_poststrat_only,
newdata = .,
allow.new.levels = TRUE,
type = "response"
)
)
} else {
# binary DV
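# with probability = TRUE, e1071 attaches class probabilities as the
# "probabilities" attribute; extract the column for P(y = 1)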
svm_preds <- svm_preds %>%
dplyr::mutate(
svm = attr(stats::predict(
object = svm_opt_poststrat_only,
newdata = .,
allow.new.levels = TRUE,
probability = TRUE
), "probabilities")[, "1"]
)
}
# post-stratification step 3: weighted mean
svm_preds <- svm_preds %>%
dplyr::group_by(!!rlang::sym(L2.unit)) %>%
dplyr::summarize(
svm = stats::weighted.mean(x = svm, w = prop), .groups = "keep"
) %>%
dplyr::ungroup()
# individual level predictions for EBMA
if (
svm_data_no_ebma %>%
dplyr::pull(var = "y_svm") %>%
unique() %>%
length() > 2
) {
svm_ind <- stats::predict(
object = svm_opt_ebma,
newdata = svm_data,
type = "response"
)
} else {
svm_ind <- attr(stats::predict(
object = svm_opt_ebma,
newdata = svm_data,
probability = TRUE
), "probabilities")[, "1"]
}
# model for EBMA
models$svm <- svm_opt_ebma
}
# Classifier 6: MRP
# Fit model
if (isTRUE(mrp.include)) {
# Create model formula
# Individual-level random effects
L1_re <- paste(paste("(1 | ", L1.x, ")", sep = ""), collapse = " + ")
# Geographic unit or geographic unit-geographic region random effects
if (is.null(L2.reg)) {
L2_re <- paste("(1 | ", L2.unit, ")", sep = "")
} else {
L2_re <- paste(
paste("(1 | ", L2.reg, "/", L2.unit, ")", sep = ""),
collapse = " + "
)
}
# Combine all random effects
all_re <- paste(c(L1_re, L2_re), collapse = " + ")
# Context-level fixed effects
if (is.null(mrp.L2.x)) {
L2_fe <- paste(L2.x, collapse = " + ")
} else if (length(mrp.L2.x) == 1 && mrp.L2.x == "empty") {
L2_fe <- ""
} else {
L2_fe <- paste(mrp.L2.x, collapse = " + ")
}
# std MrP formula
form_mrp <- as.formula(paste(y, " ~ ", L2_fe, " + ", all_re, sep = ""))
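# e.g., form_mrp: YES ~ L2.x1 + L2.x2 + (1 | L1x1) + (1 | L1x2) +
# (1 | region/state) (illustrative variable names)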
# fit model for EBMA
mrp_model_ebma <- best_subset_classifier(
y = y,
model = form_mrp,
data.train = data,
model.family = binomial(link = "probit"),
model.optimizer = "bobyqa",
n.iter = 1000000,
verbose = verbose
)
# fit MrP model for post-stratification only
mrp_model_poststrat_only <- best_subset_classifier(
y = y,
model = form_mrp,
data.train = no_ebma_data,
model.family = binomial(link = "probit"),
model.optimizer = "bobyqa",
n.iter = 1000000,
verbose = verbose
)
# post-stratification
mrp_preds <- census %>%
dplyr::mutate(
mrp = stats::predict(
object = mrp_model_poststrat_only,
newdata = .,
allow.new.levels = TRUE,
type = "response"
)
) %>%
dplyr::group_by(!! rlang::sym(L2.unit)) %>%
dplyr::summarize(
mrp = stats::weighted.mean(x = mrp, w = prop), .groups = "keep"
) %>%
dplyr::ungroup()
# individual level predictions for EBMA
mrp_ind <- stats::predict(object = mrp_model_ebma, type = "response")
# model for EBMA
models$mrp <- mrp_model_ebma
}
# --------------------------- combine L2-level predictions ------------------
# tibble of L2 units
L2_preds <- dplyr::select(census, dplyr::all_of(L2.unit)) %>%
dplyr::distinct()
# add existing classifiers
if (exists("bs_preds")) {
L2_preds <- dplyr::left_join(
x = L2_preds,
y = bs_preds,
by = L2.unit
)
}
if (exists("lasso_preds")) {
L2_preds <- dplyr::left_join(
x = L2_preds,
y = lasso_preds,
by = L2.unit
)
}
if (exists("pca_preds")) {
L2_preds <- dplyr::left_join(
x = L2_preds,
y = pca_preds,
by = L2.unit
)
}
if (exists("gb_preds")) {
L2_preds <- dplyr::left_join(
x = L2_preds,
y = gb_preds,
by = L2.unit
)
}
if (exists("svm_preds")) {
L2_preds <- dplyr::left_join(
x = L2_preds,
y = svm_preds,
by = L2.unit
)
}
if (exists("mrp_preds")) {
L2_preds <- dplyr::left_join(
x = L2_preds,
y = mrp_preds,
by = L2.unit
)
}
# individual predictions for EBMA
L1_preds <- data %>%
dplyr::select(dplyr::all_of(y)) %>%
tidyr::drop_na() %>%
# add best subset
dplyr::mutate(
best_subset = if (exists("bs_ind")) {
as.numeric(bs_ind)
} else {
NA
},
# add lasso
lasso = if (exists("lasso_ind")) {
as.numeric(lasso_ind)
} else {
NA
},
# add pca
pca = if (exists("pca_ind")) {
as.numeric(pca_ind)
} else {
NA
},
# add gb
gb = if (exists("gb_ind")) {
as.numeric(gb_ind)
} else {
NA
},
# add svm
svm = if (exists("svm_ind")) {
as.numeric(svm_ind)
} else {
NA
},
# add MrP
mrp = if (exists("mrp_ind")) {
as.numeric(mrp_ind)
} else {
NA
}
)
# drop all-NA columns, i.e. classifiers that were not run
L1_preds <- L1_preds[, apply(
X = L1_preds, MARGIN = 2, FUN = function(x) {
all(!is.na(x))
}
)]
# Function output
return(
list(
predictions = list(
Level1 = L1_preds,
Level2 = L2_preds
),
models = models
)
)
}