Nothing
#' Preprocess the input to `wcls()`, `emee()`, or `emee2()`
#'
#' @param data A data set in long format.
#' @param mf An object constructed using `match.call()`.
#' @param outcome_type If `"binary"`, an additional check will be performed to
#' ensure the outcome only takes value 0 or 1.
#' @param verbose If default (`TRUE`), additional messages will be printed
#' during data preprocessing.
#'
#' @return Returns a list of preprocessed variables: id, outcome, treatment,
#' moderator_matrix, control_matrix,availability, numerator_prob.
#'
#' @noRd
preprocess_input <- function(data, mf, outcome_type, verbose = TRUE) {
if ("tibble" %in% class(data)) {
warning("The function may not work for data in tibble format. Use as.data.frame() to convert into data frame.")
}
# creating id, outcome, treatment
stopifnot(class(mf$id) == "character")
id <- data[[mf$id]]
stopifnot(class(mf$outcome) == "character")
outcome <- data[[mf$outcome]]
stopifnot(class(mf$treatment) == "character")
treatment <- data[[mf$treatment]]
# creating availability
if (is.null(mf$availability)) {
availability <- rep(1, nrow(data))
if (verbose) {
message("availability = NULL: defaulting availability to always available.")
}
} else {
availability <- data[[mf$availability]]
}
# creating rand_prob (randomization probability)
if (is.numeric(mf$rand_prob)) {
# user-input constant randomization probability
rand_prob <- rep(mf$rand_prob, nrow(data))
if (verbose) {
message(paste0("Constant randomization probability ", mf$rand_prob, " is used."))
}
} else if (is.character(mf$rand_prob)) {
rand_prob <- data[[mf$rand_prob]]
} else {
stop(paste(
"rand_prob should be either a single numeric value or a character",
"denoting a column in data."
))
}
# creating and checking moderator variables
moderator_formula <- as.formula(mf$moderator_formula)
parse.fm_mdr <- as.character(mf$moderator_formula)
if (length(parse.fm_mdr) == 2) {
# the formula looks like "~x"
} else if (length(parse.fm_mdr) == 3) {
# the formula looks like "y ~ x"
stop(paste(
"It seems like you included variables to left of ~.",
"moderator_formula should look like ~1 or ~ mod_var1 + mod_var2."
))
} else {
stop(paste(
"Unknown moderator_formula pattern!",
"moderator_formula should look like ~1 or ~ mod_var1 + mod_var2."
))
}
moderator_matrix <- model.matrix(as.formula(mf$moderator_formula), data = data)
# creating and checking control variables
control_formula <- as.formula(mf$control_formula)
parse.fm_ctl <- as.character(mf$control_formula)
if (length(parse.fm_ctl) == 2) {
# the formula looks like "~x"
} else if (length(parse.fm_ctl) == 3) {
# the formula looks like "y ~ x"
stop(paste(
"It seems like you included variables to left of ~.",
"control_formula should look like ~1 or ~ ctrl_var1 + ctrl_var2."
))
} else {
stop(paste(
"Unknown control_formula pattern!",
"control_formula should look like ~1 or ~ ctrl_var1 + ctrl_var2."
))
}
control_matrix <- model.matrix(as.formula(mf$control_formula), data = data)
# creating and checking numerator_prob (numerator probability)
if (is.null(mf$numerator_prob)) {
numerator_prob <- rep(0.5, nrow(data))
numerator_prob_in_formula <- 0.5
if (verbose) {
message(paste0("Constant numerator probability ", numerator_prob_in_formula, " is used."))
}
} else if (is.numeric(mf$numerator_prob)) {
stopifnot(length(mf$numerator_prob) == 1)
numerator_prob <- rep(mf$numerator_prob, nrow(data))
numerator_prob_in_formula <- mf$numerator_prob
if (verbose) {
message(paste0("Constant numerator probability ", numerator_prob_in_formula, " is used."))
}
} else if (is.character(mf$numerator_prob)) {
numerator_prob <- data[[mf$numerator_prob]]
numerator_prob_in_formula <- mf$numerator_prob
if (length(unique(numerator_prob)) > 1) {
if (verbose) {
message(paste(
"Non-constant numerator probability is used.",
"Make sure you know how to appropriately choose a non-constant numerator probability",
"(that it can only depend on the moderators).",
"Otherwise the estimated causal effect can be biased."
))
}
}
} else {
stop(paste(
"numerator_prob type mismatch.",
"numerator_prob must be either NULL,",
"or a single numeric number,",
"or a character denoting a column in data."
))
}
# checking variable types
stopifnot(is.numeric(outcome))
stopifnot(is.numeric(treatment))
stopifnot(is.numeric(rand_prob))
stopifnot(is.numeric(moderator_matrix))
stopifnot(is.numeric(control_matrix))
stopifnot(is.numeric(availability))
stopifnot(is.numeric(numerator_prob))
# more checking for availability
if (any(is.na(availability))) {
stop("NA in availability variable. This package is unable to handle this.")
}
if (!all(availability %in% 0:1)) {
stop("availability variable should only contain 0 or 1.")
}
# checking NA in other variables among availability time points
index_avail <- which(availability == 1)
if (any(is.na(outcome[index_avail]))) {
stop("NA in outcome. This package is unable to handle this.")
}
if (any(is.na(treatment[index_avail]))) {
stop("NA in treatment. This package is unable to handle this.")
}
if (any(is.na(rand_prob[index_avail]))) {
stop("NA in rand_prob. This package is unable to handle this.")
}
if (any(is.na(moderator_matrix[index_avail, ]))) {
stop("NA in moderator variables. This package is unable to handle this.")
}
if (any(is.na(control_matrix[index_avail, ]))) {
stop("NA in control variables. This package is unable to handle this.")
}
if (any(is.na(numerator_prob[index_avail]))) {
stop("NA in numerator_prob. This package is unable to handle this.")
}
# checking variables that should be binary among availability time points
if (!all(treatment[index_avail] %in% 0:1)) {
stop("treatment variable should only contain 0 or 1.")
}
if (any(rand_prob[index_avail] <= 0) | any(rand_prob[index_avail] >= 1)) {
stop("rand_prob should be greater than 0 and less than 1.")
}
if (any(numerator_prob[index_avail] <= 0) | any(numerator_prob[index_avail] >= 1)) {
stop("rand_prob should be greater than 0 and less than 1.")
}
if (outcome_type == "binary") {
if (!all(outcome[index_avail] %in% 0:1)) {
stop("outcome variable should only contain 0 or 1.")
}
}
list(
id = id,
outcome = outcome,
treatment = treatment,
rand_prob = rand_prob,
moderator_matrix = moderator_matrix,
control_matrix = control_matrix,
availability = availability,
numerator_prob = numerator_prob,
numerator_prob_in_formula = numerator_prob_in_formula
# numerator_prob_in_formula is used in wcls()
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.