#' Match two groups on a single variable using bootstrapping
#'
#' @param data a dataframe
#' @param y a column in `data` with two values or a logical expression that can
#' be evaluated inside of `data` to determine groups.
#' @param x a column in `data`
#' @param caliper a caliper constraint to set on the matching. Only matches with
#' `abs(x[1] - x[2]) <= caliper` are legal. A caliper of 0 would enforce exact
#' matches.
#' @param boot number of bootstrap samples to take. Defaults to 100.
#' @param id an optional column in `data` with a unique ID for each row in
#' `data`. Matches will be provided using these IDs instead of row numbers.
#' @return the best bootstrapped match. The best match has the largest number of
#' matched pairs and the smallest _z_-score difference on `x` between the two
#' matched subgroups.
#' @export
boot_match_univariate <- function(data, y, x, caliper = 0, boot = 100, id = NULL) {
y <- enquo(y)
x <- enquo(x)
id <- enquo(id)
x_name <- quo_name(x)
x_missing <- is.na(eval_tidy(x, data))
if (validate_grouping(data, !! y)) {
y_vals <- eval_tidy(y, data)
# If y is a character vector,
# treat it as a factor for the purposes of numbering groups
if (is.character(y_vals)) {
y <- quo(as.factor(!! y))
}
}
data <- tibble::rowid_to_column(data, ".rowid")
df_matching <- data %>%
dplyr::filter(!is.na(!! x)) %>%
dplyr::mutate(.group = as.numeric(!! y)) %>%
dplyr::select(.data$.rowid, .data$.group, !! x)
# Keep only participants within +- caliper values of each other
df_template <- find_caliper_pairs(df_matching, .data$.group, !! x, caliper)
matchable_rows <- unique(c(df_template$.rowid1, df_template$.rowid2))
df_unmatchable <- df_matching[!c(df_matching$.rowid %in% matchable_rows), ]
# Do a bunch of matches
tries <- purrr::rerun(boot, bootstrap_match(df_template))
# Keep biggest N's
best_n <- max(purrr::map_int(tries, nrow))
best_tries <- tries %>%
purrr::keep(~ nrow(.x) == best_n)
# The balance function later on wants a formula
f <- rlang::new_formula(quote(.group), rlang::get_expr(x))
# Keep smallest z's
z_scores <- best_tries %>%
purrr::map(function(try) c(try$.rowid1, try$.rowid2)) %>%
purrr::map(~ df_matching[c(df_matching$.rowid %in% .x), ]) %>%
purrr::map(~ RItools::xBalance(f, data = .x, report = "z")) %>%
purrr::map_dbl(purrr::pluck, "results", 1)
best_match <- which.min(abs(z_scores))
message("Z-score difference for ", best_n,
ngettext(best_n, " pair: ", " pairs: "),
round(z_scores[best_match], 3))
# Figure out matching status
df_pairs <- best_tries[[best_match]]
data[c(df_pairs$.rowid1, df_pairs$.rowid2), "Matching"] <- "matched"
data[df_unmatchable$.rowid, "Matching"] <- "unmatchable"
data[x_missing, "Matching"] <- "missing-data"
data[is.na(data$Matching), "Matching"] <- "unmatched"
# Attach ID's of matches, defaulting to row numbers if necessary
if (rlang::quo_is_null(id)) id <- sym(".rowid")
id_vals <- dplyr::pull(data, !! id)
stopifnot(length(id_vals) == length(unique(id_vals)))
df_pairs$.match_id1 <- id_vals[df_pairs$.rowid1]
df_pairs$.match_id2 <- id_vals[df_pairs$.rowid2]
d1 <- df_pairs %>%
dplyr::select(
.rowid = .data$.rowid1,
!! quo_name(id) := .data$.match_id1,
Matching_MatchID = .data$.match_id2)
d2 <- df_pairs %>%
dplyr::select(
.rowid = .data$.rowid2,
!! quo_name(id) := .data$.match_id2,
Matching_MatchID = .data$.match_id1)
join_by <- unique(c(".rowid", quo_name(id)))
data %>%
dplyr::left_join(dplyr::bind_rows(d1, d2), by = join_by) %>%
dplyr::select(-.data$.rowid) %>%
dplyr::select(dplyr::everything(),
dplyr::one_of(c("Matching", "Matching_MatchID")))
}
bootstrap_match <- function(matchable) {
done <- dplyr::slice(matchable, 0L)
while (nrow(matchable) > 0L) {
next_match <- dplyr::sample_n(matchable, 1L)
matchable <- matchable[matchable$.rowid1 != next_match$.rowid1 &
matchable$.rowid2 != next_match$.rowid2, ]
done <- dplyr::bind_rows(done, next_match)
}
done
}
find_caliper_pairs <- function(data, group, x, caliper = 0) {
x <- enquo(x)
group <- enquo(group)
# Need to be able to uniquely identify rows
if (!rlang::has_name(data, ".rowid")) {
data <- tibble::rowid_to_column(data, ".rowid")
}
# Create a dummy ".join" column so that every row can be matched to any
# other row. Then split the groups.
each_group <- data %>%
dplyr::mutate(.join = 1) %>%
split(dplyr::pull(data, !! group))
# Uniquely name the columns in each group
each_group[[1]] <- each_group[[1]] %>%
dplyr::rename(
.group1 = !! group,
.rowid1 = .data$.rowid,
.x1 = !! x)
each_group[[2]] <- each_group[[2]] %>%
dplyr::rename(
.group2 = !! group,
.rowid2 = .data$.rowid,
.x2 = !! x)
# Generate all possible matches then keep only those that satisfy the
# caliper constraint
dplyr::full_join(each_group[[1]], each_group[[2]], by = ".join") %>%
dplyr::mutate(diff = .data$.x1 - .data$.x2) %>%
dplyr::filter(abs(diff) <= caliper) %>%
dplyr::select(.data$.rowid1, .data$.rowid2, .data$diff)
}
# Confirm that the y variable yields exactly 2 unique values
validate_grouping <- function(data, y) {
y <- enquo(y)
y_vals <- eval_tidy(y, data)
if (length(unique(y_vals)) != 2) {
values <- as.list(dplyr::distinct(data, !! y))
comma_sep <- paste0(values[[1]], collapse = ", ")
msg <- glue::glue(
"Groupings should yield two values.\n",
"{glue::backtick(names(values))} has values: {comma_sep}")
stop(msg, call. = FALSE)
}
TRUE
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.