Nothing
#' Load and format data object
#'
#' @param time A character string; the name of the numeric variable representing
#' time. Time can be either discrete or continuous.
#' @param cluster_id A character string; the name of the numeric variable
#' identifying the cluster.
#' @param individual_id A character string (optional); the name of the numeric
#' variable identifying the individual.
#' @param treatment A character string; the name of the binary variable
#' indicating treatment. Values must be either integers (0/1) or Boolean
#' (T/F).
#' @param outcome Either a character string or a vector of two character strings;
#' for a numeric or binary outcome, the single character string indicates
#' the name of the numeric or binary outcome variable; for binomial outcome
#' data, the vector of two character strings indicates the "# of successes"
#' variable and the "# of trials" variable, respectively. Values in the
#' outcome variable(s) must be either numeric or Boolean (T/F).
#' @param data A dataframe containing the stepped wedge trial data.
#' @param time_type One of c("discrete", "continuous"); whether the model treats
#' time as discrete or continuous.
#'
#' @return An object of class \code{sw_data}
#' @export
#'
#' @examples
#' example_data <- load_data(time ="period", cluster_id = "cluster", individual_id = NULL,
#' treatment = "trt", outcome = "outcome_cont", data = sw_data_example)
#' base::summary(example_data)
#'
#'
load_data <- function(
time, cluster_id, individual_id = NULL, treatment,
outcome, time_type = "discrete", data
) {
# To prevent R CMD CHECK notes
.time <- .cluster_id <- .individual_id <- .successes <- .trials <- first_exposure <- NULL
rm(.time,.cluster_id,.individual_id,.successes,.trials,first_exposure)
outcome_length <- length(outcome)
outcome_binomial <- dplyr::case_when(
outcome_length == 1 ~ FALSE,
outcome_length == 2 ~ TRUE
)
if (outcome_binomial == TRUE) {
successes <- outcome[1]
trials <- outcome[2]
outcome <- NULL
} else {
successes <- NULL
trials <- NULL
}
# Input validation
{
if (!methods::is(data,"data.frame")) { stop("`data` must be a data frame.") }
if (methods::is(data,"tbl_df")) { stop("`data` must be a non-tibble data frame.") }
if (nrow(data)==0) { stop("`data` is an empty data frame.") }
# Validate: `time_type`
if (!(time_type %in% c("discrete", "continuous"))) {
stop(paste0(
"`time_type` must be a character string specifying `discrete` or `continuous`."
))
}
for (arg in c("time", "cluster_id", "treatment",
"individual_id", "outcome", "successes", "trials")) {
var <- get(arg)
# Variable is a character string specifying variable(s) in `data`
is_string <- methods::is(var, "character")
length_one <- as.logical(length(var) == 1)
in_df <- all(as.logical(var %in% names(data)))
if (arg == "individual_id") {
if (!is.null(individual_id) && !(is_string && length_one && in_df)) {
stop(
paste0(
"`",
arg,
"` must be a character string specifying a s",
"ingle variable in `data`."
)
)
}
} else if (arg %in% c("outcome", "successes", "trials")) {
if (!is.null(var) && !(is_string && length_one && in_df)) {
stop(
paste0(
"`",
arg,
"` must be a character string specifying a s",
"ingle variable in `data`."
)
)
}
} else {
if (!(is_string && length_one && in_df)) {
stop(
paste0(
"`",
arg,
"` must be a character string specifying a s",
"ingle variable in `data`."
)
)
}
}
# Assign column(s) to val
val <- data[, var]
# Validate: `time`
if (arg %in% c("time")) {
if (!is.numeric(val)) {
stop(paste0("`", arg, "` must be numeric."))
}
}
# Validate: `treatment`
if (arg %in% c("treatment")) {
if (any(!(val %in% c(0, 1, F, T, NA)))) {
stop(paste0(
"`",
arg,
"` must only contain binary values (either T",
"/F or 1/0)."
))
}
}
# Validate: `outcome` for non-binomial data
if (arg %in% c("outcome") & outcome_binomial == FALSE) {
if (any(!(val %in% c(0, 1, F, T) | is.numeric(val)))) {
stop(paste0(
"`",
arg,
"` must only contain numeric or binary values (either T",
"/F or 1/0)."
))
}
}
# Validate: `successes` and `trials` for binomial data
if (arg %in% c("successes", "trials") & outcome_binomial == TRUE) {
if (any(!is.numeric(val))) {
stop(paste0(
"`",
arg,
"` must only contain numeric values."
))
}
}
assign(x = paste0(".", arg), value = val)
}
# Validate: `successes` <= `trials` for each observation
if (outcome_binomial == TRUE) {
if (any(successes > trials)) {
stop(paste0(
"`successes` must be less than or equal to `trials` for all observations."
))
}
}
}
# Convert binary variables to integers (if specified as boolean)
if(typeof(.outcome) == "logical") {.outcome <- as.integer(.outcome)}
if(typeof(.treatment) == "logical") {.treatment <- as.integer(.treatment)}
# Create data object
dat <- data.frame(
"outcome" = .outcome,
"successes" = .successes,
"trials" = .trials,
"time" = .time,
"cluster_id" = .cluster_id,
"individual_id" = .individual_id,
"treatment" = .treatment,
"time_type" = time_type
)
# Test whether, for all observations with a particular value of cluster_id and time, the value of treatment is the same
# If not, throw an error
# if (any(duplicated(dat[, c("cluster_id", "time")]) & duplicated(dat[, c("cluster_id", "time", "treatment")]))) {
# stop("For each unique combination of `cluster_id` and `time`, the value of `treatment` must be the same.")
# }
dat2 <- dat %>%
dplyr::group_by(cluster_id, time) %>%
dplyr::mutate(avg_tx=mean(treatment))
if (any(!(dat2$avg_tx %in% c(0,1)))) {
stop("Value of `treatment` variable must be the same for all observations in a given cluster-period.")
}
# Handle missing values
# Ignore everything related to covariates for now, only check necessary columns for missingness
# Give warning if anything necessary missing (x/y records contain missing data and are being dropped)
dat_no_missing <- dat[stats::complete.cases(dat$outcome, dat$time, dat$cluster_id, dat$treatment), ]
# Count number of rows dropped due to missing values
num_dropped <- nrow(dat) - nrow(dat_no_missing)
num_total <- nrow(dat)
# Order cluster id factor levels by roll-out sequence of intervention,
# and calculate exposure time for each cluster-period
dat_return <- dat_no_missing %>%
dplyr::group_by(cluster_id) %>%
# create variable for first sw_step where each index_ward has no_we_exposure == 1
dplyr::mutate(first_exposure = min(time[treatment == 1])) %>%
dplyr::ungroup() %>%
dplyr::mutate(cluster_id = forcats::fct_reorder(factor(cluster_id), first_exposure)) %>%
dplyr::mutate(exposure_time = ifelse(treatment == 1,
time - first_exposure + 1,
0)) %>%
dplyr::arrange(cluster_id) # necessary for some GEE analysis functions
# Add attributes and class to data object, return data object
n_clusters <- length(unique(dat_return$cluster_id))
n_times <- length(unique(dat_return$time))
n_seq <- length(unique(dat_return$first_exposure))
attr(dat_return, "n_clusters") <- n_clusters
attr(dat_return, "n_times") <- n_times
attr(dat_return, "n_seq") <- n_seq
attr(dat_return, "binomial") <- outcome_binomial
class(dat_return) <- c("data.frame", "sw_dat")
msg <- paste0(
"Stepped wedge dataset loaded. ",
stringr::str_to_title(time_type),
" time design with ",
n_clusters,
" clusters, ",
n_seq,
" sequences, and ",
n_times,
" time points."
)
if (num_dropped>0) {
msg <- paste0(
msg, " Note: ", num_dropped, "/", num_total,
" rows were dropped due to missing values for `cluster_id`, `time`, `treatment`, or `outcome`."
)
}
message(msg)
return(dat_return)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.